{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:15:48.006519', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 68335452.534585, 'timestamp': '2025-09-15 03:15:48.008878', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:48.089573', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.3317909240722656, 'timestamp': '2025-09-15 03:15:48.091491', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:48.138774', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.3635498881340027, 'timestamp': '2025-09-15 03:15:48.140818', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:48.189865', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.32277998328208923, 'timestamp': '2025-09-15 03:15:48.192485', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:48.223345', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.38533610105514526, 'timestamp': '2025-09-15 03:15:48.305210', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:48.351018', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.10644926130771637, 'timestamp': '2025-09-15 03:15:48.353187', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:48.390429', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.13102366030216217, 'timestamp': '2025-09-15 03:15:48.392650', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:48.437578', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.03480641916394234, 'timestamp': '2025-09-15 03:15:48.439872', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:48.470535', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.005003761500120163, 'timestamp': '2025-09-15 03:15:48.496060', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:48.525945', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.01741660200059414, 'timestamp': '2025-09-15 03:15:48.528264', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:48.558378', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.04518657922744751, 'timestamp': '2025-09-15 03:15:48.560476', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:48.590824', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.027425434440374374, 'timestamp': '2025-09-15 03:15:48.598569', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:48.628755', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.04769379645586014, 'timestamp': '2025-09-15 03:15:48.653997', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:48.684250', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.05360250547528267, 'timestamp': '2025-09-15 03:15:48.686236', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:48.722020', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.00701676681637764, 'timestamp': '2025-09-15 03:15:48.724093', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:48.754567', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.03688133880496025, 'timestamp': '2025-09-15 03:15:48.758675', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:48.789361', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.02347908914089203, 'timestamp': '2025-09-15 03:15:48.813142', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:48.850146', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.02019873820245266, 'timestamp': '2025-09-15 03:15:48.852570', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:48.912330', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.044917989522218704, 'timestamp': '2025-09-15 03:15:48.916388', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:48.946767', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.041808683425188065, 'timestamp': '2025-09-15 03:15:48.949124', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:48.979593', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.04711863398551941, 'timestamp': '2025-09-15 03:15:49.007320', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:49.037809', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.025185493752360344, 'timestamp': '2025-09-15 03:15:49.042527', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:49.072760', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.02557338774204254, 'timestamp': '2025-09-15 03:15:49.076364', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:49.107336', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.027826517820358276, 'timestamp': '2025-09-15 03:15:49.109417', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:49.139561', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.026328999549150467, 'timestamp': '2025-09-15 03:15:49.165029', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:49.195700', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.026946408674120903, 'timestamp': '2025-09-15 03:15:49.200291', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:15:49.236144', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.03108673170208931, 'timestamp': '2025-09-15 03:15:49.240802', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:49.271162', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.0360201820731163, 'timestamp': '2025-09-15 03:15:49.275116', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:49.319835', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.033217381685972214, 'timestamp': '2025-09-15 03:15:49.343514', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:49.384305', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.026532070711255074, 'timestamp': '2025-09-15 03:15:49.386407', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:49.417594', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.02795436605811119, 'timestamp': '2025-09-15 03:15:49.419573', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:49.449585', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.032100606709718704, 'timestamp': '2025-09-15 03:15:49.453778', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:49.485188', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.027776723727583885, 'timestamp': '2025-09-15 03:15:49.509034', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:49.539707', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.01872324012219906, 'timestamp': '2025-09-15 03:15:49.544386', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:49.574398', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.020866069942712784, 'timestamp': '2025-09-15 03:15:49.581423', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:49.613794', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.035089291632175446, 'timestamp': '2025-09-15 03:15:49.618321', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:49.648605', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.03352435305714607, 'timestamp': '2025-09-15 03:15:49.672343', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:49.702206', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.025466276332736015, 'timestamp': '2025-09-15 03:15:49.704640', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:49.734735', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.032446447759866714, 'timestamp': '2025-09-15 03:15:49.737475', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:49.767083', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.022059399634599686, 'timestamp': '2025-09-15 03:15:49.769885', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:15:50.358852', 'step': 39, 'epoch': 1} {'type': 'pplx', 'content': 52920853.06323647, 'timestamp': '2025-09-15 03:15:50.360958', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:50.390249', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.021340548992156982, 'timestamp': '2025-09-15 03:15:50.415440', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:50.446315', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.02679264359176159, 'timestamp': '2025-09-15 03:15:50.450724', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:50.481270', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.02252059243619442, 'timestamp': '2025-09-15 03:15:50.485813', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:50.515860', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.021507279947400093, 'timestamp': '2025-09-15 03:15:50.518693', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:50.549161', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.022949328646063805, 'timestamp': '2025-09-15 03:15:50.577297', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:50.607047', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.024059386923909187, 'timestamp': '2025-09-15 03:15:50.609358', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:50.640049', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.024520207196474075, 'timestamp': '2025-09-15 03:15:50.644798', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:50.675390', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.02677883766591549, 'timestamp': '2025-09-15 03:15:50.677449', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:50.707369', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.02447492815554142, 'timestamp': '2025-09-15 03:15:50.731043', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:50.761421', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.023482685908675194, 'timestamp': '2025-09-15 03:15:50.763530', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:50.793696', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.02031794749200344, 'timestamp': '2025-09-15 03:15:50.795850', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:50.828078', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.02589653804898262, 'timestamp': '2025-09-15 03:15:50.829859', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:50.860142', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.02244141697883606, 'timestamp': '2025-09-15 03:15:50.883652', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:50.913689', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.03502468392252922, 'timestamp': '2025-09-15 03:15:50.915999', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:50.946182', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.031696025282144547, 'timestamp': '2025-09-15 03:15:50.948185', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:50.978060', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.03126489743590355, 'timestamp': '2025-09-15 03:15:50.982354', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:51.012793', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.025056706741452217, 'timestamp': '2025-09-15 03:15:51.036458', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:51.066709', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.025800740346312523, 'timestamp': '2025-09-15 03:15:51.068884', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:51.098933', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.036419469863176346, 'timestamp': '2025-09-15 03:15:51.102043', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:51.133067', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.026494473218917847, 'timestamp': '2025-09-15 03:15:51.135509', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:51.165529', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.02609933726489544, 'timestamp': '2025-09-15 03:15:51.189030', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:51.219232', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.02513437159359455, 'timestamp': '2025-09-15 03:15:51.221345', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:51.251723', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.016333062201738358, 'timestamp': '2025-09-15 03:15:51.255883', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:51.285617', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.02447478286921978, 'timestamp': '2025-09-15 03:15:51.287806', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:51.320461', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.02058406174182892, 'timestamp': '2025-09-15 03:15:51.344363', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:51.374879', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.0208949763327837, 'timestamp': '2025-09-15 03:15:51.377047', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:51.407319', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.02405490167438984, 'timestamp': '2025-09-15 03:15:51.410066', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:51.440150', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.023688603192567825, 'timestamp': '2025-09-15 03:15:51.444218', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:51.474348', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.017248744145035744, 'timestamp': '2025-09-15 03:15:51.497879', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:51.528228', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.02224724553525448, 'timestamp': '2025-09-15 03:15:51.530547', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:51.560979', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.02748028375208378, 'timestamp': '2025-09-15 03:15:51.563031', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:51.592871', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.020563840866088867, 'timestamp': '2025-09-15 03:15:51.595648', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:51.625709', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.02229272946715355, 'timestamp': '2025-09-15 03:15:51.649493', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:51.679161', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.026734307408332825, 'timestamp': '2025-09-15 03:15:51.681088', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:51.710478', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.025780189782381058, 'timestamp': '2025-09-15 03:15:51.713035', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:51.742738', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.026472801342606544, 'timestamp': '2025-09-15 03:15:51.744788', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:51.775551', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.032414261251688004, 'timestamp': '2025-09-15 03:15:51.801191', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:51.831424', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.021320512518286705, 'timestamp': '2025-09-15 03:15:51.833418', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:51.863813', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.021593904122710228, 'timestamp': '2025-09-15 03:15:51.870660', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:15:52.458394', 'step': 78, 'epoch': 1} {'type': 'pplx', 'content': 58913638.63486896, 'timestamp': '2025-09-15 03:15:52.460527', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:52.490632', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.02848268486559391, 'timestamp': '2025-09-15 03:15:52.497856', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:52.528459', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.023239096626639366, 'timestamp': '2025-09-15 03:15:52.553583', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:52.584198', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.021547870710492134, 'timestamp': '2025-09-15 03:15:52.588513', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:52.618909', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.032910946756601334, 'timestamp': '2025-09-15 03:15:52.623547', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:52.654135', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.030047548934817314, 'timestamp': '2025-09-15 03:15:52.661792', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:52.693315', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.02654549479484558, 'timestamp': '2025-09-15 03:15:52.718488', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:52.751343', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.020856261253356934, 'timestamp': '2025-09-15 03:15:52.756220', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:52.786161', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.02457808144390583, 'timestamp': '2025-09-15 03:15:52.788914', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:52.818915', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.024185696616768837, 'timestamp': '2025-09-15 03:15:52.826735', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:52.856352', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.02541220188140869, 'timestamp': '2025-09-15 03:15:52.879910', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:52.910414', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.026249513030052185, 'timestamp': '2025-09-15 03:15:52.915050', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:52.945484', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.02385100908577442, 'timestamp': '2025-09-15 03:15:52.948291', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:52.978939', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.024936696514487267, 'timestamp': '2025-09-15 03:15:52.981821', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.011926', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.024910280480980873, 'timestamp': '2025-09-15 03:15:53.037598', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:53.068317', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.02396443299949169, 'timestamp': '2025-09-15 03:15:53.070289', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.100698', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.025535132735967636, 'timestamp': '2025-09-15 03:15:53.105185', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.134928', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.023879891261458397, 'timestamp': '2025-09-15 03:15:53.139811', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:53.170030', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.024169299751520157, 'timestamp': '2025-09-15 03:15:53.195202', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:53.224970', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.0213132593780756, 'timestamp': '2025-09-15 03:15:53.228309', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:53.258572', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.024025261402130127, 'timestamp': '2025-09-15 03:15:53.260512', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:53.292982', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.02572092041373253, 'timestamp': '2025-09-15 03:15:53.300802', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:53.330467', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.024027112871408463, 'timestamp': '2025-09-15 03:15:53.354365', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:53.384505', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.025775250047445297, 'timestamp': '2025-09-15 03:15:53.389232', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.419685', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.027797836810350418, 'timestamp': '2025-09-15 03:15:53.424118', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:53.453998', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.02851773239672184, 'timestamp': '2025-09-15 03:15:53.456892', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.486813', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.022648418322205544, 'timestamp': '2025-09-15 03:15:53.512430', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:15:53.547607', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.029527029022574425, 'timestamp': '2025-09-15 03:15:53.553297', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:53.583034', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.026798781007528305, 'timestamp': '2025-09-15 03:15:53.585021', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:53.615159', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.027438562363386154, 'timestamp': '2025-09-15 03:15:53.619289', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:53.649355', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.026652364060282707, 'timestamp': '2025-09-15 03:15:53.672826', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:53.702413', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.019344601780176163, 'timestamp': '2025-09-15 03:15:53.704348', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:53.734672', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.029362743720412254, 'timestamp': '2025-09-15 03:15:53.738825', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.769329', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.02442595362663269, 'timestamp': '2025-09-15 03:15:53.773998', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:53.803955', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.024722302332520485, 'timestamp': '2025-09-15 03:15:53.827441', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:53.857473', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.024500925093889236, 'timestamp': '2025-09-15 03:15:53.859457', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:53.889697', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.022203342989087105, 'timestamp': '2025-09-15 03:15:53.897301', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:53.927337', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.027586471289396286, 'timestamp': '2025-09-15 03:15:53.929333', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:53.959884', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.027563441544771194, 'timestamp': '2025-09-15 03:15:53.985276', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:54.015606', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.018406938761472702, 'timestamp': '2025-09-15 03:15:54.017604', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:15:54.612600', 'step': 117, 'epoch': 1} {'type': 'pplx', 'content': 59109284.580648854, 'timestamp': '2025-09-15 03:15:54.614935', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:54.644023', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.022183803841471672, 'timestamp': '2025-09-15 03:15:54.650679', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:54.680911', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.02371959201991558, 'timestamp': '2025-09-15 03:15:54.685557', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-15 03:15:54.726201', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.024669630452990532, 'timestamp': '2025-09-15 03:15:54.753551', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:54.784220', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.025222482159733772, 'timestamp': '2025-09-15 03:15:54.787092', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:54.817780', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.023705070838332176, 'timestamp': '2025-09-15 03:15:54.819784', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:54.850149', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.02747977338731289, 'timestamp': '2025-09-15 03:15:54.854848', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:54.885557', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.028290854766964912, 'timestamp': '2025-09-15 03:15:54.913617', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:54.943352', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.023249467834830284, 'timestamp': '2025-09-15 03:15:54.945356', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:54.975479', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.020255951210856438, 'timestamp': '2025-09-15 03:15:54.979941', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:55.010679', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.024496247991919518, 'timestamp': '2025-09-15 03:15:55.013241', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:55.042930', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.02648274041712284, 'timestamp': '2025-09-15 03:15:55.066591', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:55.097421', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.025107769295573235, 'timestamp': '2025-09-15 03:15:55.099702', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:15:55.145696', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.025274476036429405, 'timestamp': '2025-09-15 03:15:55.151458', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:55.182612', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.023525172844529152, 'timestamp': '2025-09-15 03:15:55.186745', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:55.216811', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.021011391654610634, 'timestamp': '2025-09-15 03:15:55.240430', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:55.270318', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.024462955072522163, 'timestamp': '2025-09-15 03:15:55.272363', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:55.302730', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.02858763374388218, 'timestamp': '2025-09-15 03:15:55.309554', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:55.339946', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.02398851327598095, 'timestamp': '2025-09-15 03:15:55.347655', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:55.378900', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.024971574544906616, 'timestamp': '2025-09-15 03:15:55.404070', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:55.436106', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.02387789450585842, 'timestamp': '2025-09-15 03:15:55.438113', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:55.467922', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.023335954174399376, 'timestamp': '2025-09-15 03:15:55.470408', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:55.500276', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.02436818554997444, 'timestamp': '2025-09-15 03:15:55.505054', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:55.535284', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.024393929168581963, 'timestamp': '2025-09-15 03:15:55.558851', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:55.590050', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.02338075265288353, 'timestamp': '2025-09-15 03:15:55.591988', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:15:55.622511', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.02519969828426838, 'timestamp': '2025-09-15 03:15:55.630444', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:55.660561', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.022787822410464287, 'timestamp': '2025-09-15 03:15:55.662793', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:55.692882', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.02562013640999794, 'timestamp': '2025-09-15 03:15:55.716477', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:55.746652', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.027317887172102928, 'timestamp': '2025-09-15 03:15:55.748648', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:55.778678', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.021526094526052475, 'timestamp': '2025-09-15 03:15:55.780818', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:55.810725', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.025312988087534904, 'timestamp': '2025-09-15 03:15:55.812908', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:55.843838', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.023540545254945755, 'timestamp': '2025-09-15 03:15:55.869018', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:55.898872', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.02694949321448803, 'timestamp': '2025-09-15 03:15:55.900801', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:55.930984', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.023923378437757492, 'timestamp': '2025-09-15 03:15:55.935684', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:55.965886', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.02161807008087635, 'timestamp': '2025-09-15 03:15:55.968069', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:55.998458', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.021433347836136818, 'timestamp': '2025-09-15 03:15:56.022123', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:56.052080', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.019754543900489807, 'timestamp': '2025-09-15 03:15:56.054142', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:56.090431', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.028470566496253014, 'timestamp': '2025-09-15 03:15:56.094495', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:56.124892', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.022595848888158798, 'timestamp': '2025-09-15 03:15:56.128736', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:56.159437', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.024241095408797264, 'timestamp': '2025-09-15 03:15:56.187115', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:15:56.773873', 'step': 156, 'epoch': 1} {'type': 'pplx', 'content': 61359476.46442279, 'timestamp': '2025-09-15 03:15:56.775762', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:56.803488', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.023162925615906715, 'timestamp': '2025-09-15 03:15:56.805798', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:56.836621', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.02335996739566326, 'timestamp': '2025-09-15 03:15:56.840980', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:56.871586', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.018901418894529343, 'timestamp': '2025-09-15 03:15:56.874769', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:56.907194', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.027393018826842308, 'timestamp': '2025-09-15 03:15:56.934763', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:56.965969', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.027207305654883385, 'timestamp': '2025-09-15 03:15:56.968080', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:56.998453', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.019821634516119957, 'timestamp': '2025-09-15 03:15:57.000674', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:57.031445', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.0335393063724041, 'timestamp': '2025-09-15 03:15:57.038628', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:57.070562', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.025750983506441116, 'timestamp': '2025-09-15 03:15:57.095618', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:57.126733', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.021948251873254776, 'timestamp': '2025-09-15 03:15:57.128873', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.159881', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.028559934347867966, 'timestamp': '2025-09-15 03:15:57.163644', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.194848', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.021568192169070244, 'timestamp': '2025-09-15 03:15:57.198723', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:57.228866', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.025854144245386124, 'timestamp': '2025-09-15 03:15:57.252377', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:57.283626', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.02288070134818554, 'timestamp': '2025-09-15 03:15:57.288305', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.318847', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.02382759191095829, 'timestamp': '2025-09-15 03:15:57.322644', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:57.353391', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.02871537022292614, 'timestamp': '2025-09-15 03:15:57.355815', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:57.387190', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.023803409188985825, 'timestamp': '2025-09-15 03:15:57.414869', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.445955', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.02843991480767727, 'timestamp': '2025-09-15 03:15:57.448143', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:57.479106', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.024328526109457016, 'timestamp': '2025-09-15 03:15:57.482545', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:57.514158', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.01945444755256176, 'timestamp': '2025-09-15 03:15:57.518490', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:57.549260', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.02223912812769413, 'timestamp': '2025-09-15 03:15:57.572972', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:57.603100', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.02585512213408947, 'timestamp': '2025-09-15 03:15:57.605148', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:57.636119', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.018943537026643753, 'timestamp': '2025-09-15 03:15:57.638209', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.669228', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.021992197260260582, 'timestamp': '2025-09-15 03:15:57.673270', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:57.703671', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.022745851427316666, 'timestamp': '2025-09-15 03:15:57.727440', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.758530', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.024968743324279785, 'timestamp': '2025-09-15 03:15:57.760547', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:15:57.791178', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.02671198733150959, 'timestamp': '2025-09-15 03:15:57.798469', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:57.828950', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.02292976714670658, 'timestamp': '2025-09-15 03:15:57.830983', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:57.862123', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.021866338327527046, 'timestamp': '2025-09-15 03:15:57.885897', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:57.916912', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.022539768368005753, 'timestamp': '2025-09-15 03:15:57.918855', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:57.949032', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.024748781695961952, 'timestamp': '2025-09-15 03:15:57.952890', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:57.983690', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.025110218673944473, 'timestamp': '2025-09-15 03:15:57.985849', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:58.019225', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.024864723905920982, 'timestamp': '2025-09-15 03:15:58.046438', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:58.078628', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.020917575806379318, 'timestamp': '2025-09-15 03:15:58.080833', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:58.110730', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.022952692583203316, 'timestamp': '2025-09-15 03:15:58.114832', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:58.144868', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.026193810626864433, 'timestamp': '2025-09-15 03:15:58.146922', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:58.177536', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.024926025420427322, 'timestamp': '2025-09-15 03:15:58.200957', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:58.232462', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.024060726165771484, 'timestamp': '2025-09-15 03:15:58.234556', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:58.265855', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.023254675790667534, 'timestamp': '2025-09-15 03:15:58.269786', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:58.300019', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.021828945726156235, 'timestamp': '2025-09-15 03:15:58.302229', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:15:58.890853', 'step': 195, 'epoch': 1} {'type': 'pplx', 'content': 63240685.34191352, 'timestamp': '2025-09-15 03:15:58.892845', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:58.921710', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.02316051349043846, 'timestamp': '2025-09-15 03:15:58.946872', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:58.977535', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.021014168858528137, 'timestamp': '2025-09-15 03:15:58.980003', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:59.010514', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.02295663394033909, 'timestamp': '2025-09-15 03:15:59.012910', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:59.043932', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.020413899794220924, 'timestamp': '2025-09-15 03:15:59.050830', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:59.081558', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.027345729991793633, 'timestamp': '2025-09-15 03:15:59.109124', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:59.140881', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.02064964361488819, 'timestamp': '2025-09-15 03:15:59.145002', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:59.176590', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.02292543835937977, 'timestamp': '2025-09-15 03:15:59.180177', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:15:59.210918', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.02036006562411785, 'timestamp': '2025-09-15 03:15:59.212954', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:59.244523', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.025549402460455894, 'timestamp': '2025-09-15 03:15:59.267999', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:59.298808', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.02759888581931591, 'timestamp': '2025-09-15 03:15:59.301063', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:15:59.332024', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.02341906912624836, 'timestamp': '2025-09-15 03:15:59.334162', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:59.364829', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.019058359786868095, 'timestamp': '2025-09-15 03:15:59.367043', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:15:59.398513', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.023317178711295128, 'timestamp': '2025-09-15 03:15:59.426222', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:59.457169', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.02509636990725994, 'timestamp': '2025-09-15 03:15:59.459243', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:59.489381', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.02863742969930172, 'timestamp': '2025-09-15 03:15:59.491500', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:15:59.522239', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.026905078440904617, 'timestamp': '2025-09-15 03:15:59.528504', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:59.559391', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.020303964614868164, 'timestamp': '2025-09-15 03:15:59.582997', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:59.613583', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.022942854091525078, 'timestamp': '2025-09-15 03:15:59.615904', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:59.646749', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.029736299067735672, 'timestamp': '2025-09-15 03:15:59.650708', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:15:59.681125', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.017598429694771767, 'timestamp': '2025-09-15 03:15:59.683644', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:15:59.714059', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.020063329488039017, 'timestamp': '2025-09-15 03:15:59.737749', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:15:59.768824', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.023120734840631485, 'timestamp': '2025-09-15 03:15:59.770897', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:15:59.802112', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.01837027072906494, 'timestamp': '2025-09-15 03:15:59.809582', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:15:59.842622', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.02000943198800087, 'timestamp': '2025-09-15 03:15:59.846596', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:15:59.877789', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.02292727492749691, 'timestamp': '2025-09-15 03:15:59.906404', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:15:59.937325', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.023077070713043213, 'timestamp': '2025-09-15 03:15:59.939530', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:15:59.970641', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.023584123700857162, 'timestamp': '2025-09-15 03:15:59.980604', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:00.010909', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.023418009281158447, 'timestamp': '2025-09-15 03:16:00.017626', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:00.050416', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.023128027096390724, 'timestamp': '2025-09-15 03:16:00.077827', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:00.111619', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.025857388973236084, 'timestamp': '2025-09-15 03:16:00.118405', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:00.150814', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.02984294295310974, 'timestamp': '2025-09-15 03:16:00.155137', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:00.188163', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.023621682077646255, 'timestamp': '2025-09-15 03:16:00.192943', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:00.222693', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.023447996005415916, 'timestamp': '2025-09-15 03:16:00.246168', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:00.276645', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.016722386702895164, 'timestamp': '2025-09-15 03:16:00.281477', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:00.311349', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.01983584091067314, 'timestamp': '2025-09-15 03:16:00.314175', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:00.344247', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.02162855677306652, 'timestamp': '2025-09-15 03:16:00.346282', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:00.376759', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.0186337660998106, 'timestamp': '2025-09-15 03:16:00.405397', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:00.435669', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.014806436374783516, 'timestamp': '2025-09-15 03:16:00.441105', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:00.471606', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.025260770693421364, 'timestamp': '2025-09-15 03:16:00.473838', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:01.059105', 'step': 234, 'epoch': 1} {'type': 'pplx', 'content': 68587555.94777116, 'timestamp': '2025-09-15 03:16:01.061841', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:01.090645', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.023948902264237404, 'timestamp': '2025-09-15 03:16:01.097497', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:01.138216', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.017647096887230873, 'timestamp': '2025-09-15 03:16:01.166955', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:01.216207', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.03198563680052757, 'timestamp': '2025-09-15 03:16:01.218513', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:01.253481', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.01824996992945671, 'timestamp': '2025-09-15 03:16:01.258287', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:01.292269', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.017625119537115097, 'timestamp': '2025-09-15 03:16:01.295143', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:01.325745', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.02666931040585041, 'timestamp': '2025-09-15 03:16:01.351393', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:01.384605', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.02557450346648693, 'timestamp': '2025-09-15 03:16:01.389564', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:01.426453', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.01579933427274227, 'timestamp': '2025-09-15 03:16:01.430592', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:01.462033', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.020587289705872536, 'timestamp': '2025-09-15 03:16:01.469014', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:01.503272', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.022956201806664467, 'timestamp': '2025-09-15 03:16:01.528951', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:01.559007', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.01939821057021618, 'timestamp': '2025-09-15 03:16:01.564059', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:01.598428', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.02754567377269268, 'timestamp': '2025-09-15 03:16:01.609592', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:01.646882', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.019356006756424904, 'timestamp': '2025-09-15 03:16:01.651543', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:01.681678', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.0224157627671957, 'timestamp': '2025-09-15 03:16:01.706886', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:01.737442', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.020564500242471695, 'timestamp': '2025-09-15 03:16:01.741870', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:01.771817', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.01674218662083149, 'timestamp': '2025-09-15 03:16:01.774526', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:01.804875', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.021581009030342102, 'timestamp': '2025-09-15 03:16:01.807230', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:01.837827', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.017507784068584442, 'timestamp': '2025-09-15 03:16:01.861517', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:01.892258', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.01738118752837181, 'timestamp': '2025-09-15 03:16:01.894337', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:01.924438', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.020651668310165405, 'timestamp': '2025-09-15 03:16:01.928310', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:01.960365', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.026446225121617317, 'timestamp': '2025-09-15 03:16:01.962690', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:02.007777', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.014327983371913433, 'timestamp': '2025-09-15 03:16:02.031486', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:02.061650', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.0174112506210804, 'timestamp': '2025-09-15 03:16:02.066336', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:02.097433', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.009260174818336964, 'timestamp': '2025-09-15 03:16:02.105288', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:02.135256', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.010992279276251793, 'timestamp': '2025-09-15 03:16:02.137198', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:02.167718', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.019353589043021202, 'timestamp': '2025-09-15 03:16:02.195424', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:02.235178', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.01912912353873253, 'timestamp': '2025-09-15 03:16:02.237274', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:02.267743', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.014038893394172192, 'timestamp': '2025-09-15 03:16:02.270031', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:02.299986', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.012295400723814964, 'timestamp': '2025-09-15 03:16:02.301907', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:02.331731', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.025572476908564568, 'timestamp': '2025-09-15 03:16:02.355181', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:02.385263', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.015300482511520386, 'timestamp': '2025-09-15 03:16:02.387305', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:02.417616', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.02356751821935177, 'timestamp': '2025-09-15 03:16:02.419765', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:02.451581', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.004978724289685488, 'timestamp': '2025-09-15 03:16:02.455768', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:02.485952', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.022996727377176285, 'timestamp': '2025-09-15 03:16:02.509706', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:02.540134', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.02457096055150032, 'timestamp': '2025-09-15 03:16:02.542113', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:02.572318', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.018262630328536034, 'timestamp': '2025-09-15 03:16:02.575065', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:02.606054', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.02449517324566841, 'timestamp': '2025-09-15 03:16:02.610680', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:02.641699', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.025416718795895576, 'timestamp': '2025-09-15 03:16:02.669505', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:02.699836', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.023906651884317398, 'timestamp': '2025-09-15 03:16:02.705064', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:03.293915', 'step': 273, 'epoch': 1} {'type': 'pplx', 'content': 77374753.92709866, 'timestamp': '2025-09-15 03:16:03.295807', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:03.324814', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.011430694721639156, 'timestamp': '2025-09-15 03:16:03.331480', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:03.360944', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.026782521978020668, 'timestamp': '2025-09-15 03:16:03.362990', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:03.393194', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.012034070678055286, 'timestamp': '2025-09-15 03:16:03.424345', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:03.454416', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.016790401190519333, 'timestamp': '2025-09-15 03:16:03.457660', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:03.488260', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.01346579473465681, 'timestamp': '2025-09-15 03:16:03.495872', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:03.526113', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.022128688171505928, 'timestamp': '2025-09-15 03:16:03.530737', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:03.561100', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.012642556801438332, 'timestamp': '2025-09-15 03:16:03.586256', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:03.616278', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.02572466991841793, 'timestamp': '2025-09-15 03:16:03.618722', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:03.653866', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.017196912318468094, 'timestamp': '2025-09-15 03:16:03.656866', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:03.689693', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.02792796678841114, 'timestamp': '2025-09-15 03:16:03.697318', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-15 03:16:03.739770', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.025717342272400856, 'timestamp': '2025-09-15 03:16:03.767450', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:03.798160', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.027002578601241112, 'timestamp': '2025-09-15 03:16:03.800246', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:03.830992', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.025492871180176735, 'timestamp': '2025-09-15 03:16:03.837887', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:03.868229', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.018493708223104477, 'timestamp': '2025-09-15 03:16:03.875013', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:03.905566', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.008588691242039204, 'timestamp': '2025-09-15 03:16:03.931240', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:03.961557', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.003366441000252962, 'timestamp': '2025-09-15 03:16:03.964499', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:03.995942', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.019292891025543213, 'timestamp': '2025-09-15 03:16:04.000649', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:04.031152', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.02494696155190468, 'timestamp': '2025-09-15 03:16:04.033506', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:04.064036', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.012115180492401123, 'timestamp': '2025-09-15 03:16:04.088222', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:04.118593', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.016373474150896072, 'timestamp': '2025-09-15 03:16:04.120561', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:04.150255', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.01117027085274458, 'timestamp': '2025-09-15 03:16:04.152433', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:04.183598', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.009900221601128578, 'timestamp': '2025-09-15 03:16:04.190417', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:04.221134', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.022509068250656128, 'timestamp': '2025-09-15 03:16:04.244752', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:04.274919', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.005992523860186338, 'timestamp': '2025-09-15 03:16:04.276991', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:04.307606', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.033216144889593124, 'timestamp': '2025-09-15 03:16:04.312027', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:04.342252', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.012912544421851635, 'timestamp': '2025-09-15 03:16:04.346608', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:04.376479', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.02481652982532978, 'timestamp': '2025-09-15 03:16:04.404690', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:04.441606', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.01673789881169796, 'timestamp': '2025-09-15 03:16:04.443856', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:04.474536', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.03715262562036514, 'timestamp': '2025-09-15 03:16:04.478669', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:04.508637', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.004444826859980822, 'timestamp': '2025-09-15 03:16:04.513313', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:04.543214', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.005952953360974789, 'timestamp': '2025-09-15 03:16:04.566759', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:04.596758', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.03581325337290764, 'timestamp': '2025-09-15 03:16:04.598732', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:04.628891', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.009639021940529346, 'timestamp': '2025-09-15 03:16:04.635954', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:04.666569', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.03855990990996361, 'timestamp': '2025-09-15 03:16:04.673739', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:04.705876', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.013932475820183754, 'timestamp': '2025-09-15 03:16:04.734125', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:04.764417', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.00806758925318718, 'timestamp': '2025-09-15 03:16:04.766439', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:04.796896', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.0032700072042644024, 'timestamp': '2025-09-15 03:16:04.803803', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:04.834028', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.006026607472449541, 'timestamp': '2025-09-15 03:16:04.835941', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:04.865621', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.013435856439173222, 'timestamp': '2025-09-15 03:16:04.889411', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:05.473420', 'step': 312, 'epoch': 1} {'type': 'pplx', 'content': 89022351.57575472, 'timestamp': '2025-09-15 03:16:05.475374', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:05.502935', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.036437805742025375, 'timestamp': '2025-09-15 03:16:05.505087', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:05.535909', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.05053964629769325, 'timestamp': '2025-09-15 03:16:05.538056', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:05.568376', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.017013149335980415, 'timestamp': '2025-09-15 03:16:05.572587', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:05.603024', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.002634893637150526, 'timestamp': '2025-09-15 03:16:05.628501', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:05.658537', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.005089492071419954, 'timestamp': '2025-09-15 03:16:05.660675', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:05.690788', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.004525291733443737, 'timestamp': '2025-09-15 03:16:05.693367', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:05.722921', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.016721466556191444, 'timestamp': '2025-09-15 03:16:05.724874', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:05.754870', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.032533131539821625, 'timestamp': '2025-09-15 03:16:05.778422', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:05.808211', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.004051280207931995, 'timestamp': '2025-09-15 03:16:05.810112', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:05.840928', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.007642671465873718, 'timestamp': '2025-09-15 03:16:05.848333', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:05.878559', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.004581484943628311, 'timestamp': '2025-09-15 03:16:05.883138', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:05.914276', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.009225315414369106, 'timestamp': '2025-09-15 03:16:05.939314', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:05.969234', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.0009719752706587315, 'timestamp': '2025-09-15 03:16:05.971499', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:06.001728', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.033414021134376526, 'timestamp': '2025-09-15 03:16:06.005825', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:06.036288', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.0025679587852209806, 'timestamp': '2025-09-15 03:16:06.038522', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:06.068012', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.00764654204249382, 'timestamp': '2025-09-15 03:16:06.091669', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:06.123720', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.02005215547978878, 'timestamp': '2025-09-15 03:16:06.128928', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:06.158880', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.02345801331102848, 'timestamp': '2025-09-15 03:16:06.161011', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:06.190959', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.02312997356057167, 'timestamp': '2025-09-15 03:16:06.193407', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:06.224681', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.020947817713022232, 'timestamp': '2025-09-15 03:16:06.248142', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:06.278708', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.06012619286775589, 'timestamp': '2025-09-15 03:16:06.283450', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:06.330792', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.03723708540201187, 'timestamp': '2025-09-15 03:16:06.333611', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:06.363381', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.010374733246862888, 'timestamp': '2025-09-15 03:16:06.365496', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:06.395558', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.02376265451312065, 'timestamp': '2025-09-15 03:16:06.419128', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:06.448906', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.04077703505754471, 'timestamp': '2025-09-15 03:16:06.450933', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:06.480718', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.021582217887043953, 'timestamp': '2025-09-15 03:16:06.482904', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:06.513294', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.043330080807209015, 'timestamp': '2025-09-15 03:16:06.515453', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:06.545615', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.023107782006263733, 'timestamp': '2025-09-15 03:16:06.569241', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:06.599564', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.026244070380926132, 'timestamp': '2025-09-15 03:16:06.604508', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:06.634271', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.037476424127817154, 'timestamp': '2025-09-15 03:16:06.636382', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:06.666516', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.0057834298349916935, 'timestamp': '2025-09-15 03:16:06.670570', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:06.701597', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.01934720017015934, 'timestamp': '2025-09-15 03:16:06.725330', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:06.755623', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.005825077183544636, 'timestamp': '2025-09-15 03:16:06.760328', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:06.790320', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.008595766499638557, 'timestamp': '2025-09-15 03:16:06.794700', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:06.824602', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.010267524980008602, 'timestamp': '2025-09-15 03:16:06.827384', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:06.857976', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.011575107462704182, 'timestamp': '2025-09-15 03:16:06.883325', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:06.913632', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.02673574723303318, 'timestamp': '2025-09-15 03:16:06.915749', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:06.945985', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.0218326635658741, 'timestamp': '2025-09-15 03:16:06.953220', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:06.983094', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.0157585758715868, 'timestamp': '2025-09-15 03:16:06.987430', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:07.574117', 'step': 351, 'epoch': 1} {'type': 'pplx', 'content': 80515886.71850136, 'timestamp': '2025-09-15 03:16:07.575945', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:07.605506', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.02800087444484234, 'timestamp': '2025-09-15 03:16:07.632636', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:07.663026', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.04366505146026611, 'timestamp': '2025-09-15 03:16:07.664951', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:07.694873', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.018956486135721207, 'timestamp': '2025-09-15 03:16:07.697241', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:07.728686', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.01885378547012806, 'timestamp': '2025-09-15 03:16:07.730703', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:07.761101', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.027556609362363815, 'timestamp': '2025-09-15 03:16:07.789151', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:07.819784', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.021303247660398483, 'timestamp': '2025-09-15 03:16:07.821766', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:07.851821', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.026625454425811768, 'timestamp': '2025-09-15 03:16:07.853766', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:07.884845', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.012120525352656841, 'timestamp': '2025-09-15 03:16:07.887214', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:07.917751', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.013295002281665802, 'timestamp': '2025-09-15 03:16:07.946614', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:07.976683', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.01189348753541708, 'timestamp': '2025-09-15 03:16:07.978817', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:08.008998', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.020258372649550438, 'timestamp': '2025-09-15 03:16:08.013390', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:08.044661', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.02005624771118164, 'timestamp': '2025-09-15 03:16:08.049028', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:08.079058', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.025262445211410522, 'timestamp': '2025-09-15 03:16:08.102696', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:08.132990', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.014361625537276268, 'timestamp': '2025-09-15 03:16:08.135138', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:08.165036', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.011819976381957531, 'timestamp': '2025-09-15 03:16:08.167828', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:08.198223', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.014466187916696072, 'timestamp': '2025-09-15 03:16:08.200731', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:08.231093', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.02088986523449421, 'timestamp': '2025-09-15 03:16:08.256553', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:08.286505', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.01293105911463499, 'timestamp': '2025-09-15 03:16:08.288490', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:08.318654', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.014627523720264435, 'timestamp': '2025-09-15 03:16:08.323105', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:08.353213', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.020171448588371277, 'timestamp': '2025-09-15 03:16:08.357974', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:08.387988', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.0217885784804821, 'timestamp': '2025-09-15 03:16:08.411714', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:08.441612', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.015202087350189686, 'timestamp': '2025-09-15 03:16:08.443503', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:08.473812', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.01769433729350567, 'timestamp': '2025-09-15 03:16:08.476796', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:08.507228', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.020400600507855415, 'timestamp': '2025-09-15 03:16:08.511747', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:08.541899', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.02527294121682644, 'timestamp': '2025-09-15 03:16:08.569894', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:08.601274', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.01625806838274002, 'timestamp': '2025-09-15 03:16:08.604014', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:08.635058', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.010705024935305119, 'timestamp': '2025-09-15 03:16:08.642680', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:08.672820', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.027743404731154442, 'timestamp': '2025-09-15 03:16:08.677407', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:08.707603', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.01390306930989027, 'timestamp': '2025-09-15 03:16:08.731121', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:08.761695', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.015007426962256432, 'timestamp': '2025-09-15 03:16:08.763849', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:08.794662', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.020648064091801643, 'timestamp': '2025-09-15 03:16:08.801824', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:08.832571', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.024542948231101036, 'timestamp': '2025-09-15 03:16:08.839611', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:08.870954', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.015119172632694244, 'timestamp': '2025-09-15 03:16:08.899056', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:08.928788', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.013927723281085491, 'timestamp': '2025-09-15 03:16:08.930791', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:08.960837', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.020588034763932228, 'timestamp': '2025-09-15 03:16:08.968592', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:08.999144', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.010085856541991234, 'timestamp': '2025-09-15 03:16:09.002084', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:09.032030', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.025173211470246315, 'timestamp': '2025-09-15 03:16:09.055580', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:09.085476', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.013549414463341236, 'timestamp': '2025-09-15 03:16:09.087514', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:09.118462', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.014271484687924385, 'timestamp': '2025-09-15 03:16:09.125291', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:09.714210', 'step': 390, 'epoch': 1} {'type': 'pplx', 'content': 74571334.02457094, 'timestamp': '2025-09-15 03:16:09.716103', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:09.745211', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.02279849909245968, 'timestamp': '2025-09-15 03:16:09.747530', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:09.777890', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.024111445993185043, 'timestamp': '2025-09-15 03:16:09.801520', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:09.832311', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.009041680954396725, 'timestamp': '2025-09-15 03:16:09.837023', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:09.869468', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.016649093478918076, 'timestamp': '2025-09-15 03:16:09.872050', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:09.902270', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.011015170253813267, 'timestamp': '2025-09-15 03:16:09.906656', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:09.937062', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.007471670396625996, 'timestamp': '2025-09-15 03:16:09.968302', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:09.998501', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.013137311674654484, 'timestamp': '2025-09-15 03:16:10.000642', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:10.031338', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.028483940288424492, 'timestamp': '2025-09-15 03:16:10.035614', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:10.066017', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.023605694994330406, 'timestamp': '2025-09-15 03:16:10.073789', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:10.104027', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.015281885862350464, 'timestamp': '2025-09-15 03:16:10.127551', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:10.157649', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.013710461556911469, 'timestamp': '2025-09-15 03:16:10.159868', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:10.191049', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.03442396968603134, 'timestamp': '2025-09-15 03:16:10.197908', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:10.229151', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.016301406547427177, 'timestamp': '2025-09-15 03:16:10.236174', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:10.266148', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.025119978934526443, 'timestamp': '2025-09-15 03:16:10.290013', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:10.321276', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.01429225504398346, 'timestamp': '2025-09-15 03:16:10.328633', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:10.359475', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.004983606748282909, 'timestamp': '2025-09-15 03:16:10.363745', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:10.395255', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.02256079576909542, 'timestamp': '2025-09-15 03:16:10.402130', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:10.432333', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.006051547359675169, 'timestamp': '2025-09-15 03:16:10.456188', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:10.489213', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.01480391900986433, 'timestamp': '2025-09-15 03:16:10.491200', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:10.521790', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.010563238523900509, 'timestamp': '2025-09-15 03:16:10.523920', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:10.554697', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.014779276214540005, 'timestamp': '2025-09-15 03:16:10.558329', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:10.588689', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.005051398184150457, 'timestamp': '2025-09-15 03:16:10.612367', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:10.643269', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.016271298751235008, 'timestamp': '2025-09-15 03:16:10.645238', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:10.678636', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.013035394251346588, 'timestamp': '2025-09-15 03:16:10.682660', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:10.714060', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.019557014107704163, 'timestamp': '2025-09-15 03:16:10.720255', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:10.752032', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.01767909713089466, 'timestamp': '2025-09-15 03:16:10.775676', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:10.807108', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.011926579289138317, 'timestamp': '2025-09-15 03:16:10.811832', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:10.843174', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.01481988001614809, 'timestamp': '2025-09-15 03:16:10.846654', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-15 03:16:10.894362', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.02013516239821911, 'timestamp': '2025-09-15 03:16:10.902422', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:10.933117', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.020092064514756203, 'timestamp': '2025-09-15 03:16:10.956524', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:10.987665', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.009992966428399086, 'timestamp': '2025-09-15 03:16:10.990192', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:11.021351', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.02413918450474739, 'timestamp': '2025-09-15 03:16:11.023888', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:11.055527', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.005685184150934219, 'timestamp': '2025-09-15 03:16:11.062706', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:11.093177', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.015884699299931526, 'timestamp': '2025-09-15 03:16:11.116857', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:11.147706', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.022595107555389404, 'timestamp': '2025-09-15 03:16:11.149911', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:11.180806', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.005032018758356571, 'timestamp': '2025-09-15 03:16:11.184481', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:11.215158', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.007993181236088276, 'timestamp': '2025-09-15 03:16:11.217514', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:11.248219', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.015601440332829952, 'timestamp': '2025-09-15 03:16:11.271699', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:11.302226', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.022765683010220528, 'timestamp': '2025-09-15 03:16:11.304289', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:11.889127', 'step': 429, 'epoch': 1} {'type': 'pplx', 'content': 82419156.97382797, 'timestamp': '2025-09-15 03:16:11.891298', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:11.920202', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.007856682874262333, 'timestamp': '2025-09-15 03:16:11.922209', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:11.953331', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.012552931904792786, 'timestamp': '2025-09-15 03:16:11.960394', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:11.990508', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.015140875242650509, 'timestamp': '2025-09-15 03:16:12.014126', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:12.044589', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.011779015883803368, 'timestamp': '2025-09-15 03:16:12.048748', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:12.079806', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.028670471161603928, 'timestamp': '2025-09-15 03:16:12.083634', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:12.114367', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.01725790835916996, 'timestamp': '2025-09-15 03:16:12.116305', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:12.146651', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.023671019822359085, 'timestamp': '2025-09-15 03:16:12.175083', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:12.206069', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.002824623603373766, 'timestamp': '2025-09-15 03:16:12.208020', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:12.238555', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.012223334051668644, 'timestamp': '2025-09-15 03:16:12.245523', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:12.276347', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.010377810336649418, 'timestamp': '2025-09-15 03:16:12.278469', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:12.309576', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.02659183368086815, 'timestamp': '2025-09-15 03:16:12.333236', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:12.364400', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.006721060257405043, 'timestamp': '2025-09-15 03:16:12.368362', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:12.400872', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.035596225410699844, 'timestamp': '2025-09-15 03:16:12.402912', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:12.434756', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.0020581563003361225, 'timestamp': '2025-09-15 03:16:12.441807', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:12.472652', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.007875720039010048, 'timestamp': '2025-09-15 03:16:12.496083', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:12.527247', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.009889481589198112, 'timestamp': '2025-09-15 03:16:12.529371', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:12.560602', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.03284718841314316, 'timestamp': '2025-09-15 03:16:12.562823', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:12.593710', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.01849648542702198, 'timestamp': '2025-09-15 03:16:12.596021', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:12.626581', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.03222808614373207, 'timestamp': '2025-09-15 03:16:12.650479', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:12.681732', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.011861540377140045, 'timestamp': '2025-09-15 03:16:12.683818', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:12.714390', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.014407354407012463, 'timestamp': '2025-09-15 03:16:12.718207', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:12.748950', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.005681209731847048, 'timestamp': '2025-09-15 03:16:12.751246', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:12.781759', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.03094957210123539, 'timestamp': '2025-09-15 03:16:12.805505', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:12.836427', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.004415884148329496, 'timestamp': '2025-09-15 03:16:12.838703', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:12.869797', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.023646658286452293, 'timestamp': '2025-09-15 03:16:12.871917', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:12.902870', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.01825256459414959, 'timestamp': '2025-09-15 03:16:12.906481', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:12.937048', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.014448435045778751, 'timestamp': '2025-09-15 03:16:12.964490', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:12.995091', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.003266042796894908, 'timestamp': '2025-09-15 03:16:13.002712', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:13.032840', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.010741782374680042, 'timestamp': '2025-09-15 03:16:13.039677', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:13.070181', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.023588094860315323, 'timestamp': '2025-09-15 03:16:13.072505', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:13.102950', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.013890265487134457, 'timestamp': '2025-09-15 03:16:13.126602', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:13.157332', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.03365600109100342, 'timestamp': '2025-09-15 03:16:13.159365', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:13.189741', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.011849649250507355, 'timestamp': '2025-09-15 03:16:13.191833', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:13.222366', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.013702219352126122, 'timestamp': '2025-09-15 03:16:13.224423', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:13.254781', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.004361593630164862, 'timestamp': '2025-09-15 03:16:13.278669', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:13.311353', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.010045178234577179, 'timestamp': '2025-09-15 03:16:13.313416', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:13.343864', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.0036023962311446667, 'timestamp': '2025-09-15 03:16:13.345903', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:13.376424', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.0018648395780473948, 'timestamp': '2025-09-15 03:16:13.378860', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:13.409600', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.01642528362572193, 'timestamp': '2025-09-15 03:16:13.433408', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:14.018984', 'step': 468, 'epoch': 1} {'type': 'pplx', 'content': 89994610.81027079, 'timestamp': '2025-09-15 03:16:14.020991', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:14.049924', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.02225838229060173, 'timestamp': '2025-09-15 03:16:14.051901', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:14.082434', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.012944989837706089, 'timestamp': '2025-09-15 03:16:14.084750', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:14.116114', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.0015818104147911072, 'timestamp': '2025-09-15 03:16:14.118321', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:14.148756', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.01057495642453432, 'timestamp': '2025-09-15 03:16:14.173721', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:14.204761', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.03044239804148674, 'timestamp': '2025-09-15 03:16:14.209014', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:14.239329', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.02977249026298523, 'timestamp': '2025-09-15 03:16:14.241306', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:14.271617', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.0017273669363930821, 'timestamp': '2025-09-15 03:16:14.275784', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:14.307221', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.025188205763697624, 'timestamp': '2025-09-15 03:16:14.334889', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:14.365106', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.03656993433833122, 'timestamp': '2025-09-15 03:16:14.367088', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:14.397260', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.03108203411102295, 'timestamp': '2025-09-15 03:16:14.399261', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:14.430904', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.03353414312005043, 'timestamp': '2025-09-15 03:16:14.434707', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:14.464683', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.003992644604295492, 'timestamp': '2025-09-15 03:16:14.488349', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:14.519279', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.006703828927129507, 'timestamp': '2025-09-15 03:16:14.521283', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:14.552148', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.0031639602966606617, 'timestamp': '2025-09-15 03:16:14.555726', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:14.586911', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.03814245015382767, 'timestamp': '2025-09-15 03:16:14.590632', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:14.621214', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.017492132261395454, 'timestamp': '2025-09-15 03:16:14.644981', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:14.676050', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.022614041343331337, 'timestamp': '2025-09-15 03:16:14.683270', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:14.713773', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.011417665518820286, 'timestamp': '2025-09-15 03:16:14.715931', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:14.747103', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.010924013331532478, 'timestamp': '2025-09-15 03:16:14.749320', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:14.780206', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.013234901241958141, 'timestamp': '2025-09-15 03:16:14.803725', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:14.834583', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.006062183063477278, 'timestamp': '2025-09-15 03:16:14.836613', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:14.866954', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.009334629401564598, 'timestamp': '2025-09-15 03:16:14.870820', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:14.901669', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.015576253645122051, 'timestamp': '2025-09-15 03:16:14.903654', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:14.935197', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.006126120686531067, 'timestamp': '2025-09-15 03:16:14.958725', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:14.990060', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.0050699966959655285, 'timestamp': '2025-09-15 03:16:14.995151', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:15.026200', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.015693241730332375, 'timestamp': '2025-09-15 03:16:15.029936', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:15.060515', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.009532543830573559, 'timestamp': '2025-09-15 03:16:15.062630', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:15.092879', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.02666117250919342, 'timestamp': '2025-09-15 03:16:15.118090', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:15.148832', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.03242836147546768, 'timestamp': '2025-09-15 03:16:15.150824', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:15.181371', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.020675910636782646, 'timestamp': '2025-09-15 03:16:15.188732', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:15.219467', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.018618302419781685, 'timestamp': '2025-09-15 03:16:15.221549', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:15.252547', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.021370384842157364, 'timestamp': '2025-09-15 03:16:15.279935', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-09-15 03:16:21.950577', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:21.988100', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.0373503677546978, 'timestamp': '2025-09-15 03:16:21.990113', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:22.024094', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.0026365232188254595, 'timestamp': '2025-09-15 03:16:22.026097', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-15 03:16:22.057196', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.01669916883111, 'timestamp': '2025-09-15 03:16:22.068808', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:22.099620', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.023004433140158653, 'timestamp': '2025-09-15 03:16:22.124198', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:22.154621', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.03475857526063919, 'timestamp': '2025-09-15 03:16:22.156604', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:22.187632', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.027279671281576157, 'timestamp': '2025-09-15 03:16:22.191592', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:22.224447', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.010024458169937134, 'timestamp': '2025-09-15 03:16:22.231492', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:22.823830', 'step': 507, 'epoch': 1} {'type': 'pplx', 'content': 91882933.03724283, 'timestamp': '2025-09-15 03:16:22.826008', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:22.855877', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.03937874361872673, 'timestamp': '2025-09-15 03:16:22.879687', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:22.910646', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.009307787753641605, 'timestamp': '2025-09-15 03:16:22.912493', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:22.943593', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.007812165655195713, 'timestamp': '2025-09-15 03:16:22.949903', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:22.980832', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.014223689213395119, 'timestamp': '2025-09-15 03:16:22.982851', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:23.013914', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.034484755247831345, 'timestamp': '2025-09-15 03:16:23.038861', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:23.070240', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.051926709711551666, 'timestamp': '2025-09-15 03:16:23.075215', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:23.105999', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.0070337154902517796, 'timestamp': '2025-09-15 03:16:23.109788', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:23.141717', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.0057702637277543545, 'timestamp': '2025-09-15 03:16:23.148368', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 2373281365952}, 'timestamp': '2025-09-15 03:16:23.193173', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.010492673143744469, 'timestamp': '2025-09-15 03:16:23.217013', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:23.248013', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.020993739366531372, 'timestamp': '2025-09-15 03:16:23.249895', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:23.281199', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.005516039673238993, 'timestamp': '2025-09-15 03:16:23.287803', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:23.318976', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.014884904026985168, 'timestamp': '2025-09-15 03:16:23.322711', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:23.353328', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.016969164833426476, 'timestamp': '2025-09-15 03:16:23.376911', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:23.407828', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.017451364547014236, 'timestamp': '2025-09-15 03:16:23.410920', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:23.442614', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.0072836605831980705, 'timestamp': '2025-09-15 03:16:23.444596', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:23.475148', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.03784263879060745, 'timestamp': '2025-09-15 03:16:23.479224', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:23.510259', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.009146297350525856, 'timestamp': '2025-09-15 03:16:23.537615', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:23.568894', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.014295176602900028, 'timestamp': '2025-09-15 03:16:23.570854', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:23.602323', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.013233968988060951, 'timestamp': '2025-09-15 03:16:23.606343', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:23.637385', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.023095833137631416, 'timestamp': '2025-09-15 03:16:23.643916', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:23.674712', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.009187877178192139, 'timestamp': '2025-09-15 03:16:23.702281', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:23.733249', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.03069307468831539, 'timestamp': '2025-09-15 03:16:23.735035', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:23.766431', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.026559680700302124, 'timestamp': '2025-09-15 03:16:23.768463', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:23.800597', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.010563480667769909, 'timestamp': '2025-09-15 03:16:23.803675', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:23.835335', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.014871816150844097, 'timestamp': '2025-09-15 03:16:23.858918', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:23.890289', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.0237137358635664, 'timestamp': '2025-09-15 03:16:23.892517', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:23.922951', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.00401802035048604, 'timestamp': '2025-09-15 03:16:23.926668', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:23.958168', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.011716380715370178, 'timestamp': '2025-09-15 03:16:23.960693', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:23.992109', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.0034951730631291866, 'timestamp': '2025-09-15 03:16:24.016560', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:24.049158', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.013428243808448315, 'timestamp': '2025-09-15 03:16:24.051200', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:24.081795', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.018913377076387405, 'timestamp': '2025-09-15 03:16:24.085679', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:24.116421', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.008952634409070015, 'timestamp': '2025-09-15 03:16:24.122736', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:24.153800', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.010391423478722572, 'timestamp': '2025-09-15 03:16:24.178636', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:24.215531', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.033783040940761566, 'timestamp': '2025-09-15 03:16:24.219507', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:24.256786', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.018627535551786423, 'timestamp': '2025-09-15 03:16:24.259212', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:24.289674', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.022923845797777176, 'timestamp': '2025-09-15 03:16:24.293451', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:24.323991', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.014477974735200405, 'timestamp': '2025-09-15 03:16:24.347530', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:24.379793', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.010905615985393524, 'timestamp': '2025-09-15 03:16:24.384797', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:24.424449', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.015100879594683647, 'timestamp': '2025-09-15 03:16:24.431849', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:25.056448', 'step': 546, 'epoch': 1} {'type': 'pplx', 'content': 89409619.26869535, 'timestamp': '2025-09-15 03:16:25.058542', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:25.094970', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.019579365849494934, 'timestamp': '2025-09-15 03:16:25.100875', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:25.131972', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.014669536612927914, 'timestamp': '2025-09-15 03:16:25.155773', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:25.186873', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.007602573838084936, 'timestamp': '2025-09-15 03:16:25.189133', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:25.222238', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.01011333055794239, 'timestamp': '2025-09-15 03:16:25.224225', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:25.254253', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.008165261708199978, 'timestamp': '2025-09-15 03:16:25.256283', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:25.287103', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.011216294951736927, 'timestamp': '2025-09-15 03:16:25.314768', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:25.346143', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.016402242705225945, 'timestamp': '2025-09-15 03:16:25.348227', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:25.379545', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.012566662393510342, 'timestamp': '2025-09-15 03:16:25.382634', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:25.413335', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.010999364778399467, 'timestamp': '2025-09-15 03:16:25.415460', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:25.447215', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.01768830046057701, 'timestamp': '2025-09-15 03:16:25.472171', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:25.509057', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.010185576975345612, 'timestamp': '2025-09-15 03:16:25.511060', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:25.543175', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.008643836714327335, 'timestamp': '2025-09-15 03:16:25.547164', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:25.592428', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.010841822251677513, 'timestamp': '2025-09-15 03:16:25.594492', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:25.625665', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.011339940130710602, 'timestamp': '2025-09-15 03:16:25.650238', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:25.681804', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.021963687613606453, 'timestamp': '2025-09-15 03:16:25.685664', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:25.716629', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.007223943714052439, 'timestamp': '2025-09-15 03:16:25.722988', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:25.755056', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.020964499562978745, 'timestamp': '2025-09-15 03:16:25.757256', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:25.788135', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.011233599856495857, 'timestamp': '2025-09-15 03:16:25.812979', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:25.845622', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.010387702845036983, 'timestamp': '2025-09-15 03:16:25.849734', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:25.880895', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.0083076860755682, 'timestamp': '2025-09-15 03:16:25.882878', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:25.913724', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.008273184299468994, 'timestamp': '2025-09-15 03:16:25.915818', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:25.947138', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.029909178614616394, 'timestamp': '2025-09-15 03:16:25.974634', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:26.005839', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.002194339642301202, 'timestamp': '2025-09-15 03:16:26.007911', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:26.038846', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.01601494289934635, 'timestamp': '2025-09-15 03:16:26.040877', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:26.074163', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.0036689683329313993, 'timestamp': '2025-09-15 03:16:26.080349', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:26.111727', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.004763443022966385, 'timestamp': '2025-09-15 03:16:26.136385', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:26.166662', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.017256123945116997, 'timestamp': '2025-09-15 03:16:26.168681', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:26.200094', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.0017653962131589651, 'timestamp': '2025-09-15 03:16:26.206329', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:26.237036', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.0193465743213892, 'timestamp': '2025-09-15 03:16:26.244267', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:26.276743', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.004393164534121752, 'timestamp': '2025-09-15 03:16:26.300332', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:26.330817', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.0037941287737339735, 'timestamp': '2025-09-15 03:16:26.332938', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:26.363896', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.0058485642075538635, 'timestamp': '2025-09-15 03:16:26.367504', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:26.398604', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.025454718619585037, 'timestamp': '2025-09-15 03:16:26.405155', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:26.435732', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.007020084653049707, 'timestamp': '2025-09-15 03:16:26.460490', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:26.490814', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.009553035721182823, 'timestamp': '2025-09-15 03:16:26.492897', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:26.523780', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.008278349414467812, 'timestamp': '2025-09-15 03:16:26.531010', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:26.562056', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.0093050766736269, 'timestamp': '2025-09-15 03:16:26.565876', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:26.596585', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.010887989774346352, 'timestamp': '2025-09-15 03:16:26.621409', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:26.652519', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.03535384312272072, 'timestamp': '2025-09-15 03:16:26.654693', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:27.242879', 'step': 585, 'epoch': 1} {'type': 'pplx', 'content': 98861864.54475525, 'timestamp': '2025-09-15 03:16:27.245026', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:27.274538', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.004063038155436516, 'timestamp': '2025-09-15 03:16:27.276485', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:27.307699', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.010096131823956966, 'timestamp': '2025-09-15 03:16:27.311677', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:27.342110', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.03615206480026245, 'timestamp': '2025-09-15 03:16:27.365904', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:27.397509', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.011354520916938782, 'timestamp': '2025-09-15 03:16:27.399931', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:27.431176', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.028571562841534615, 'timestamp': '2025-09-15 03:16:27.433426', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:27.464686', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.02120061218738556, 'timestamp': '2025-09-15 03:16:27.468582', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:27.499303', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.017664339393377304, 'timestamp': '2025-09-15 03:16:27.526697', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:27.567770', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.024698514491319656, 'timestamp': '2025-09-15 03:16:27.569993', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:27.600896', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.017938822507858276, 'timestamp': '2025-09-15 03:16:27.603229', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:27.636026', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.021319078281521797, 'timestamp': '2025-09-15 03:16:27.639547', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:27.670627', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.016344984993338585, 'timestamp': '2025-09-15 03:16:27.694665', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:27.729956', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.03623601421713829, 'timestamp': '2025-09-15 03:16:27.732064', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:27.762421', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.008179962635040283, 'timestamp': '2025-09-15 03:16:27.765476', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:27.808792', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.04286444932222366, 'timestamp': '2025-09-15 03:16:27.810768', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:27.841669', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.002624726388603449, 'timestamp': '2025-09-15 03:16:27.869981', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:27.901387', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.020307507365942, 'timestamp': '2025-09-15 03:16:27.903524', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:27.934720', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.011326144449412823, 'timestamp': '2025-09-15 03:16:27.940844', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:27.971864', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.014740855433046818, 'timestamp': '2025-09-15 03:16:27.974138', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:28.004842', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.005876082926988602, 'timestamp': '2025-09-15 03:16:28.029876', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:28.072400', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.020726444199681282, 'timestamp': '2025-09-15 03:16:28.075612', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:28.109917', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.0367356613278389, 'timestamp': '2025-09-15 03:16:28.112013', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:28.143701', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.04382667690515518, 'timestamp': '2025-09-15 03:16:28.145851', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:28.176470', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.01624279096722603, 'timestamp': '2025-09-15 03:16:28.200157', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:28.235642', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.033406250178813934, 'timestamp': '2025-09-15 03:16:28.241303', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:28.273855', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.008596173487603664, 'timestamp': '2025-09-15 03:16:28.276174', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:28.306420', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.015879623591899872, 'timestamp': '2025-09-15 03:16:28.311014', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:28.341507', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.00993258785456419, 'timestamp': '2025-09-15 03:16:28.369275', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:28.401614', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.016361379995942116, 'timestamp': '2025-09-15 03:16:28.405897', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:28.436227', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.015362887643277645, 'timestamp': '2025-09-15 03:16:28.438569', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:28.468674', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.016249025240540504, 'timestamp': '2025-09-15 03:16:28.475849', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:28.509620', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.04643365740776062, 'timestamp': '2025-09-15 03:16:28.533483', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:28.568172', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.023818328976631165, 'timestamp': '2025-09-15 03:16:28.572843', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:28.604918', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.02747488021850586, 'timestamp': '2025-09-15 03:16:28.607219', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:28.639406', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.01822894625365734, 'timestamp': '2025-09-15 03:16:28.644272', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:28.674899', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.011014273390173912, 'timestamp': '2025-09-15 03:16:28.700046', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:28.730674', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.018967941403388977, 'timestamp': '2025-09-15 03:16:28.732984', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:28.763784', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.003352448111400008, 'timestamp': '2025-09-15 03:16:28.770669', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [2, 192], 'flops': 2847885110400}, 'timestamp': '2025-09-15 03:16:28.807097', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.02111704833805561, 'timestamp': '2025-09-15 03:16:28.809129', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:28.857756', 'step': 623, 'epoch': 2} {'type': 'loss', 'content': 0.015314899384975433, 'timestamp': '2025-09-15 03:16:28.881610', 'step': 624, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:29.781314', 'step': 624, 'epoch': 2} {'type': 'pplx', 'content': 82289978.05812527, 'timestamp': '2025-09-15 03:16:29.783394', 'step': 624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:29.828336', 'step': 624, 'epoch': 2} {'type': 'loss', 'content': 0.02048143930733204, 'timestamp': '2025-09-15 03:16:29.830776', 'step': 625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:29.894938', 'step': 625, 'epoch': 2} {'type': 'loss', 'content': 0.009616588242352009, 'timestamp': '2025-09-15 03:16:29.899160', 'step': 626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:29.929697', 'step': 626, 'epoch': 2} {'type': 'loss', 'content': 0.008175384253263474, 'timestamp': '2025-09-15 03:16:29.934138', 'step': 627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:29.966024', 'step': 627, 'epoch': 2} {'type': 'loss', 'content': 0.003119382541626692, 'timestamp': '2025-09-15 03:16:29.994430', 'step': 628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:30.026722', 'step': 628, 'epoch': 2} {'type': 'loss', 'content': 0.013358536176383495, 'timestamp': '2025-09-15 03:16:30.028860', 'step': 629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:30.062870', 'step': 629, 'epoch': 2} {'type': 'loss', 'content': 0.011693683452904224, 'timestamp': '2025-09-15 03:16:30.064956', 'step': 630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:30.110879', 'step': 630, 'epoch': 2} {'type': 'loss', 'content': 0.03411411494016647, 'timestamp': '2025-09-15 03:16:30.115525', 'step': 631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:30.148633', 'step': 631, 'epoch': 2} {'type': 'loss', 'content': 0.009591693058609962, 'timestamp': '2025-09-15 03:16:30.172452', 'step': 632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:30.215888', 'step': 632, 'epoch': 2} {'type': 'loss', 'content': 0.012655379250645638, 'timestamp': '2025-09-15 03:16:30.217968', 'step': 633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:30.258789', 'step': 633, 'epoch': 2} {'type': 'loss', 'content': 0.04086309298872948, 'timestamp': '2025-09-15 03:16:30.266120', 'step': 634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:30.318110', 'step': 634, 'epoch': 2} {'type': 'loss', 'content': 0.009325115010142326, 'timestamp': '2025-09-15 03:16:30.322378', 'step': 635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:30.389630', 'step': 635, 'epoch': 2} {'type': 'loss', 'content': 0.007273114286363125, 'timestamp': '2025-09-15 03:16:30.413627', 'step': 636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:30.460387', 'step': 636, 'epoch': 2} {'type': 'loss', 'content': 0.025864509865641594, 'timestamp': '2025-09-15 03:16:30.462396', 'step': 637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:30.508504', 'step': 637, 'epoch': 2} {'type': 'loss', 'content': 0.021363092586398125, 'timestamp': '2025-09-15 03:16:30.513055', 'step': 638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:30.552361', 'step': 638, 'epoch': 2} {'type': 'loss', 'content': 0.02703591249883175, 'timestamp': '2025-09-15 03:16:30.555609', 'step': 639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:30.597540', 'step': 639, 'epoch': 2} {'type': 'loss', 'content': 0.017300015315413475, 'timestamp': '2025-09-15 03:16:30.622644', 'step': 640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:30.692036', 'step': 640, 'epoch': 2} {'type': 'loss', 'content': 0.009818075224757195, 'timestamp': '2025-09-15 03:16:30.696861', 'step': 641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:30.738126', 'step': 641, 'epoch': 2} {'type': 'loss', 'content': 0.01826951466500759, 'timestamp': '2025-09-15 03:16:30.740509', 'step': 642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:30.787648', 'step': 642, 'epoch': 2} {'type': 'loss', 'content': 0.01782812550663948, 'timestamp': '2025-09-15 03:16:30.794983', 'step': 643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:30.843831', 'step': 643, 'epoch': 2} {'type': 'loss', 'content': 0.017956728115677834, 'timestamp': '2025-09-15 03:16:30.872155', 'step': 644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:30.920502', 'step': 644, 'epoch': 2} {'type': 'loss', 'content': 0.01835721917450428, 'timestamp': '2025-09-15 03:16:30.922427', 'step': 645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:30.974228', 'step': 645, 'epoch': 2} {'type': 'loss', 'content': 0.023359332233667374, 'timestamp': '2025-09-15 03:16:30.976342', 'step': 646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:31.035373', 'step': 646, 'epoch': 2} {'type': 'loss', 'content': 0.03923198580741882, 'timestamp': '2025-09-15 03:16:31.040605', 'step': 647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:31.083255', 'step': 647, 'epoch': 2} {'type': 'loss', 'content': 0.005954307969659567, 'timestamp': '2025-09-15 03:16:31.111450', 'step': 648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:31.151549', 'step': 648, 'epoch': 2} {'type': 'loss', 'content': 0.006974720861762762, 'timestamp': '2025-09-15 03:16:31.157511', 'step': 649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:31.190714', 'step': 649, 'epoch': 2} {'type': 'loss', 'content': 0.03683912381529808, 'timestamp': '2025-09-15 03:16:31.195443', 'step': 650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:31.226021', 'step': 650, 'epoch': 2} {'type': 'loss', 'content': 0.027897637337446213, 'timestamp': '2025-09-15 03:16:31.228874', 'step': 651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:31.259785', 'step': 651, 'epoch': 2} {'type': 'loss', 'content': 0.012562957592308521, 'timestamp': '2025-09-15 03:16:31.288342', 'step': 652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:31.318162', 'step': 652, 'epoch': 2} {'type': 'loss', 'content': 0.007003194652497768, 'timestamp': '2025-09-15 03:16:31.320261', 'step': 653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:31.352551', 'step': 653, 'epoch': 2} {'type': 'loss', 'content': 0.00510317413136363, 'timestamp': '2025-09-15 03:16:31.357365', 'step': 654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:31.387314', 'step': 654, 'epoch': 2} {'type': 'loss', 'content': 0.020198477432131767, 'timestamp': '2025-09-15 03:16:31.389518', 'step': 655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:31.422067', 'step': 655, 'epoch': 2} {'type': 'loss', 'content': 0.016794826835393906, 'timestamp': '2025-09-15 03:16:31.450534', 'step': 656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:31.480399', 'step': 656, 'epoch': 2} {'type': 'loss', 'content': 0.02333107963204384, 'timestamp': '2025-09-15 03:16:31.485657', 'step': 657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:31.516024', 'step': 657, 'epoch': 2} {'type': 'loss', 'content': 0.013125887140631676, 'timestamp': '2025-09-15 03:16:31.521153', 'step': 658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:31.551427', 'step': 658, 'epoch': 2} {'type': 'loss', 'content': 0.011822559870779514, 'timestamp': '2025-09-15 03:16:31.554582', 'step': 659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:31.584092', 'step': 659, 'epoch': 2} {'type': 'loss', 'content': 0.018171224743127823, 'timestamp': '2025-09-15 03:16:31.607707', 'step': 660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:31.637972', 'step': 660, 'epoch': 2} {'type': 'loss', 'content': 0.017307527363300323, 'timestamp': '2025-09-15 03:16:31.640046', 'step': 661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:31.669186', 'step': 661, 'epoch': 2} {'type': 'loss', 'content': 0.01217662263661623, 'timestamp': '2025-09-15 03:16:31.671433', 'step': 662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:31.701554', 'step': 662, 'epoch': 2} {'type': 'loss', 'content': 0.015428896062076092, 'timestamp': '2025-09-15 03:16:31.707209', 'step': 663, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:32.293684', 'step': 663, 'epoch': 2} {'type': 'pplx', 'content': 75053295.7847995, 'timestamp': '2025-09-15 03:16:32.295602', 'step': 663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:32.324065', 'step': 663, 'epoch': 2} {'type': 'loss', 'content': 0.01401414256542921, 'timestamp': '2025-09-15 03:16:32.352081', 'step': 664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:32.382447', 'step': 664, 'epoch': 2} {'type': 'loss', 'content': 0.025133652612566948, 'timestamp': '2025-09-15 03:16:32.385035', 'step': 665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:32.414441', 'step': 665, 'epoch': 2} {'type': 'loss', 'content': 0.010838499292731285, 'timestamp': '2025-09-15 03:16:32.417702', 'step': 666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:32.447082', 'step': 666, 'epoch': 2} {'type': 'loss', 'content': 0.04639563336968422, 'timestamp': '2025-09-15 03:16:32.454729', 'step': 667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:32.484473', 'step': 667, 'epoch': 2} {'type': 'loss', 'content': 0.006913153920322657, 'timestamp': '2025-09-15 03:16:32.507752', 'step': 668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:32.537939', 'step': 668, 'epoch': 2} {'type': 'loss', 'content': 0.01731266640126705, 'timestamp': '2025-09-15 03:16:32.540688', 'step': 669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:32.570215', 'step': 669, 'epoch': 2} {'type': 'loss', 'content': 0.021937739104032516, 'timestamp': '2025-09-15 03:16:32.571954', 'step': 670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:32.600976', 'step': 670, 'epoch': 2} {'type': 'loss', 'content': 0.027854878455400467, 'timestamp': '2025-09-15 03:16:32.602817', 'step': 671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:32.631921', 'step': 671, 'epoch': 2} {'type': 'loss', 'content': 0.022487761452794075, 'timestamp': '2025-09-15 03:16:32.655646', 'step': 672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:32.684848', 'step': 672, 'epoch': 2} {'type': 'loss', 'content': 0.01178536843508482, 'timestamp': '2025-09-15 03:16:32.687401', 'step': 673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:32.717121', 'step': 673, 'epoch': 2} {'type': 'loss', 'content': 0.02284214086830616, 'timestamp': '2025-09-15 03:16:32.718899', 'step': 674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:32.747878', 'step': 674, 'epoch': 2} {'type': 'loss', 'content': 0.004044529981911182, 'timestamp': '2025-09-15 03:16:32.750692', 'step': 675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:32.779566', 'step': 675, 'epoch': 2} {'type': 'loss', 'content': 0.028430771082639694, 'timestamp': '2025-09-15 03:16:32.802856', 'step': 676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:32.836214', 'step': 676, 'epoch': 2} {'type': 'loss', 'content': 0.009200649335980415, 'timestamp': '2025-09-15 03:16:32.837894', 'step': 677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:32.871695', 'step': 677, 'epoch': 2} {'type': 'loss', 'content': 0.015377013944089413, 'timestamp': '2025-09-15 03:16:32.876501', 'step': 678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:32.906114', 'step': 678, 'epoch': 2} {'type': 'loss', 'content': 0.02052604779601097, 'timestamp': '2025-09-15 03:16:32.908188', 'step': 679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:32.937695', 'step': 679, 'epoch': 2} {'type': 'loss', 'content': 0.015673715621232986, 'timestamp': '2025-09-15 03:16:32.961332', 'step': 680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:32.990713', 'step': 680, 'epoch': 2} {'type': 'loss', 'content': 0.02544725500047207, 'timestamp': '2025-09-15 03:16:32.992609', 'step': 681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:33.022312', 'step': 681, 'epoch': 2} {'type': 'loss', 'content': 0.016639648005366325, 'timestamp': '2025-09-15 03:16:33.025334', 'step': 682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:33.054557', 'step': 682, 'epoch': 2} {'type': 'loss', 'content': 0.020371485501527786, 'timestamp': '2025-09-15 03:16:33.056748', 'step': 683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:33.087222', 'step': 683, 'epoch': 2} {'type': 'loss', 'content': 0.012767528183758259, 'timestamp': '2025-09-15 03:16:33.111045', 'step': 684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:33.140413', 'step': 684, 'epoch': 2} {'type': 'loss', 'content': 0.01824815571308136, 'timestamp': '2025-09-15 03:16:33.142737', 'step': 685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:33.172116', 'step': 685, 'epoch': 2} {'type': 'loss', 'content': 0.009122505784034729, 'timestamp': '2025-09-15 03:16:33.175010', 'step': 686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:33.204797', 'step': 686, 'epoch': 2} {'type': 'loss', 'content': 0.01623712293803692, 'timestamp': '2025-09-15 03:16:33.209842', 'step': 687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:33.239531', 'step': 687, 'epoch': 2} {'type': 'loss', 'content': 0.01666615717113018, 'timestamp': '2025-09-15 03:16:33.265187', 'step': 688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:33.293849', 'step': 688, 'epoch': 2} {'type': 'loss', 'content': 0.0204975213855505, 'timestamp': '2025-09-15 03:16:33.295799', 'step': 689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:33.325298', 'step': 689, 'epoch': 2} {'type': 'loss', 'content': 0.011426101438701153, 'timestamp': '2025-09-15 03:16:33.335610', 'step': 690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:33.366510', 'step': 690, 'epoch': 2} {'type': 'loss', 'content': 0.01042362954467535, 'timestamp': '2025-09-15 03:16:33.368838', 'step': 691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:33.398837', 'step': 691, 'epoch': 2} {'type': 'loss', 'content': 0.028445009142160416, 'timestamp': '2025-09-15 03:16:33.424469', 'step': 692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:33.454157', 'step': 692, 'epoch': 2} {'type': 'loss', 'content': 0.011215626262128353, 'timestamp': '2025-09-15 03:16:33.456148', 'step': 693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:33.485775', 'step': 693, 'epoch': 2} {'type': 'loss', 'content': 0.012956084683537483, 'timestamp': '2025-09-15 03:16:33.487973', 'step': 694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:33.517389', 'step': 694, 'epoch': 2} {'type': 'loss', 'content': 0.012555070221424103, 'timestamp': '2025-09-15 03:16:33.519670', 'step': 695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:33.549477', 'step': 695, 'epoch': 2} {'type': 'loss', 'content': 0.012987296096980572, 'timestamp': '2025-09-15 03:16:33.572674', 'step': 696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:33.601684', 'step': 696, 'epoch': 2} {'type': 'loss', 'content': 0.009055888280272484, 'timestamp': '2025-09-15 03:16:33.603746', 'step': 697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:33.633132', 'step': 697, 'epoch': 2} {'type': 'loss', 'content': 0.003815798321738839, 'timestamp': '2025-09-15 03:16:33.634950', 'step': 698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:33.664742', 'step': 698, 'epoch': 2} {'type': 'loss', 'content': 0.004667258355766535, 'timestamp': '2025-09-15 03:16:33.669761', 'step': 699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:33.698956', 'step': 699, 'epoch': 2} {'type': 'loss', 'content': 0.017840759828686714, 'timestamp': '2025-09-15 03:16:33.723232', 'step': 700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:33.752626', 'step': 700, 'epoch': 2} {'type': 'loss', 'content': 0.015343495644629002, 'timestamp': '2025-09-15 03:16:33.757678', 'step': 701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:33.788145', 'step': 701, 'epoch': 2} {'type': 'loss', 'content': 0.020067986100912094, 'timestamp': '2025-09-15 03:16:33.796127', 'step': 702, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:34.379492', 'step': 702, 'epoch': 2} {'type': 'pplx', 'content': 77832797.15443674, 'timestamp': '2025-09-15 03:16:34.381487', 'step': 702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:34.409799', 'step': 702, 'epoch': 2} {'type': 'loss', 'content': 0.006786287762224674, 'timestamp': '2025-09-15 03:16:34.414257', 'step': 703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:34.443684', 'step': 703, 'epoch': 2} {'type': 'loss', 'content': 0.007841740734875202, 'timestamp': '2025-09-15 03:16:34.471957', 'step': 704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:34.501439', 'step': 704, 'epoch': 2} {'type': 'loss', 'content': 0.006768788676708937, 'timestamp': '2025-09-15 03:16:34.504126', 'step': 705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:34.534114', 'step': 705, 'epoch': 2} {'type': 'loss', 'content': 0.013998471200466156, 'timestamp': '2025-09-15 03:16:34.542119', 'step': 706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:34.572814', 'step': 706, 'epoch': 2} {'type': 'loss', 'content': 0.016503024846315384, 'timestamp': '2025-09-15 03:16:34.577344', 'step': 707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:34.607323', 'step': 707, 'epoch': 2} {'type': 'loss', 'content': 0.0034505471121519804, 'timestamp': '2025-09-15 03:16:34.635888', 'step': 708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:34.664818', 'step': 708, 'epoch': 2} {'type': 'loss', 'content': 0.005293605383485556, 'timestamp': '2025-09-15 03:16:34.666822', 'step': 709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:34.696327', 'step': 709, 'epoch': 2} {'type': 'loss', 'content': 0.01751369796693325, 'timestamp': '2025-09-15 03:16:34.704308', 'step': 710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:34.733767', 'step': 710, 'epoch': 2} {'type': 'loss', 'content': 0.0058233654126524925, 'timestamp': '2025-09-15 03:16:34.735421', 'step': 711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:34.765340', 'step': 711, 'epoch': 2} {'type': 'loss', 'content': 0.01761193387210369, 'timestamp': '2025-09-15 03:16:34.793544', 'step': 712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:34.822857', 'step': 712, 'epoch': 2} {'type': 'loss', 'content': 0.008471434004604816, 'timestamp': '2025-09-15 03:16:34.826186', 'step': 713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:34.856873', 'step': 713, 'epoch': 2} {'type': 'loss', 'content': 0.010182567872107029, 'timestamp': '2025-09-15 03:16:34.860710', 'step': 714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:34.893589', 'step': 714, 'epoch': 2} {'type': 'loss', 'content': 0.010621651075780392, 'timestamp': '2025-09-15 03:16:34.898591', 'step': 715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:34.928577', 'step': 715, 'epoch': 2} {'type': 'loss', 'content': 0.026854127645492554, 'timestamp': '2025-09-15 03:16:34.954121', 'step': 716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:34.984982', 'step': 716, 'epoch': 2} {'type': 'loss', 'content': 0.014122831635177135, 'timestamp': '2025-09-15 03:16:34.987777', 'step': 717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:35.017197', 'step': 717, 'epoch': 2} {'type': 'loss', 'content': 0.006635740399360657, 'timestamp': '2025-09-15 03:16:35.022246', 'step': 718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:35.052408', 'step': 718, 'epoch': 2} {'type': 'loss', 'content': 0.025449639186263084, 'timestamp': '2025-09-15 03:16:35.057072', 'step': 719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:35.087518', 'step': 719, 'epoch': 2} {'type': 'loss', 'content': 0.012080073356628418, 'timestamp': '2025-09-15 03:16:35.110634', 'step': 720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:35.138838', 'step': 720, 'epoch': 2} {'type': 'loss', 'content': 0.025123355910182, 'timestamp': '2025-09-15 03:16:35.140986', 'step': 721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:35.170557', 'step': 721, 'epoch': 2} {'type': 'loss', 'content': 0.0044836062006652355, 'timestamp': '2025-09-15 03:16:35.178670', 'step': 722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:35.208096', 'step': 722, 'epoch': 2} {'type': 'loss', 'content': 0.0019570142030715942, 'timestamp': '2025-09-15 03:16:35.211255', 'step': 723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:35.240586', 'step': 723, 'epoch': 2} {'type': 'loss', 'content': 0.005114336032420397, 'timestamp': '2025-09-15 03:16:35.268864', 'step': 724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:35.298475', 'step': 724, 'epoch': 2} {'type': 'loss', 'content': 0.0038301029708236456, 'timestamp': '2025-09-15 03:16:35.301247', 'step': 725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:35.330890', 'step': 725, 'epoch': 2} {'type': 'loss', 'content': 0.01759827882051468, 'timestamp': '2025-09-15 03:16:35.334012', 'step': 726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:35.363990', 'step': 726, 'epoch': 2} {'type': 'loss', 'content': 0.012181101366877556, 'timestamp': '2025-09-15 03:16:35.369008', 'step': 727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:35.400083', 'step': 727, 'epoch': 2} {'type': 'loss', 'content': 0.004602841567248106, 'timestamp': '2025-09-15 03:16:35.429385', 'step': 728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:35.458710', 'step': 728, 'epoch': 2} {'type': 'loss', 'content': 0.00435149110853672, 'timestamp': '2025-09-15 03:16:35.460902', 'step': 729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:35.492071', 'step': 729, 'epoch': 2} {'type': 'loss', 'content': 0.006036927457898855, 'timestamp': '2025-09-15 03:16:35.496762', 'step': 730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:35.525715', 'step': 730, 'epoch': 2} {'type': 'loss', 'content': 0.005526290275156498, 'timestamp': '2025-09-15 03:16:35.527628', 'step': 731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:35.556538', 'step': 731, 'epoch': 2} {'type': 'loss', 'content': 0.018071789294481277, 'timestamp': '2025-09-15 03:16:35.579581', 'step': 732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:35.608987', 'step': 732, 'epoch': 2} {'type': 'loss', 'content': 0.012418746016919613, 'timestamp': '2025-09-15 03:16:35.611290', 'step': 733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:35.640967', 'step': 733, 'epoch': 2} {'type': 'loss', 'content': 0.017110273241996765, 'timestamp': '2025-09-15 03:16:35.646041', 'step': 734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:35.676386', 'step': 734, 'epoch': 2} {'type': 'loss', 'content': 0.0071751694194972515, 'timestamp': '2025-09-15 03:16:35.678099', 'step': 735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:35.707546', 'step': 735, 'epoch': 2} {'type': 'loss', 'content': 0.008484402671456337, 'timestamp': '2025-09-15 03:16:35.733162', 'step': 736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:35.763879', 'step': 736, 'epoch': 2} {'type': 'loss', 'content': 0.018988804891705513, 'timestamp': '2025-09-15 03:16:35.769527', 'step': 737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:35.799353', 'step': 737, 'epoch': 2} {'type': 'loss', 'content': 0.013006098568439484, 'timestamp': '2025-09-15 03:16:35.801249', 'step': 738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:35.831004', 'step': 738, 'epoch': 2} {'type': 'loss', 'content': 0.005592212080955505, 'timestamp': '2025-09-15 03:16:35.836027', 'step': 739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:35.865638', 'step': 739, 'epoch': 2} {'type': 'loss', 'content': 0.0024918068666011095, 'timestamp': '2025-09-15 03:16:35.890307', 'step': 740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:35.920453', 'step': 740, 'epoch': 2} {'type': 'loss', 'content': 0.01893484592437744, 'timestamp': '2025-09-15 03:16:35.925465', 'step': 741, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:36.509456', 'step': 741, 'epoch': 2} {'type': 'pplx', 'content': 91392949.23677413, 'timestamp': '2025-09-15 03:16:36.511155', 'step': 741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:36.539681', 'step': 741, 'epoch': 2} {'type': 'loss', 'content': 0.006453401874750853, 'timestamp': '2025-09-15 03:16:36.543957', 'step': 742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-15 03:16:36.573874', 'step': 742, 'epoch': 2} {'type': 'loss', 'content': 0.011112798936665058, 'timestamp': '2025-09-15 03:16:36.584820', 'step': 743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:36.615801', 'step': 743, 'epoch': 2} {'type': 'loss', 'content': 0.004536872263997793, 'timestamp': '2025-09-15 03:16:36.641339', 'step': 744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:36.671119', 'step': 744, 'epoch': 2} {'type': 'loss', 'content': 0.0465034581720829, 'timestamp': '2025-09-15 03:16:36.673193', 'step': 745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:36.702738', 'step': 745, 'epoch': 2} {'type': 'loss', 'content': 0.03564067929983139, 'timestamp': '2025-09-15 03:16:36.707647', 'step': 746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:36.737894', 'step': 746, 'epoch': 2} {'type': 'loss', 'content': 0.01765831746160984, 'timestamp': '2025-09-15 03:16:36.744866', 'step': 747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:36.774711', 'step': 747, 'epoch': 2} {'type': 'loss', 'content': 0.017794979736208916, 'timestamp': '2025-09-15 03:16:36.798376', 'step': 748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:36.828621', 'step': 748, 'epoch': 2} {'type': 'loss', 'content': 0.002160144504159689, 'timestamp': '2025-09-15 03:16:36.830790', 'step': 749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:36.861174', 'step': 749, 'epoch': 2} {'type': 'loss', 'content': 0.0007071318104863167, 'timestamp': '2025-09-15 03:16:36.863311', 'step': 750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:36.893618', 'step': 750, 'epoch': 2} {'type': 'loss', 'content': 0.0015173929277807474, 'timestamp': '2025-09-15 03:16:36.895853', 'step': 751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:36.926094', 'step': 751, 'epoch': 2} {'type': 'loss', 'content': 0.005006411578506231, 'timestamp': '2025-09-15 03:16:36.951636', 'step': 752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:36.981783', 'step': 752, 'epoch': 2} {'type': 'loss', 'content': 0.008088169619441032, 'timestamp': '2025-09-15 03:16:36.989623', 'step': 753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:37.020847', 'step': 753, 'epoch': 2} {'type': 'loss', 'content': 0.0008037859806790948, 'timestamp': '2025-09-15 03:16:37.024945', 'step': 754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:37.054851', 'step': 754, 'epoch': 2} {'type': 'loss', 'content': 0.001742966240271926, 'timestamp': '2025-09-15 03:16:37.056939', 'step': 755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:37.087156', 'step': 755, 'epoch': 2} {'type': 'loss', 'content': 0.00438849488273263, 'timestamp': '2025-09-15 03:16:37.112369', 'step': 756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:37.142793', 'step': 756, 'epoch': 2} {'type': 'loss', 'content': 0.008937621489167213, 'timestamp': '2025-09-15 03:16:37.147315', 'step': 757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:37.177595', 'step': 757, 'epoch': 2} {'type': 'loss', 'content': 0.05840831249952316, 'timestamp': '2025-09-15 03:16:37.185106', 'step': 758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:37.215954', 'step': 758, 'epoch': 2} {'type': 'loss', 'content': 0.0005601375596597791, 'timestamp': '2025-09-15 03:16:37.220617', 'step': 759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:37.250156', 'step': 759, 'epoch': 2} {'type': 'loss', 'content': 0.011043830774724483, 'timestamp': '2025-09-15 03:16:37.273773', 'step': 760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:37.303774', 'step': 760, 'epoch': 2} {'type': 'loss', 'content': 0.0023872568272054195, 'timestamp': '2025-09-15 03:16:37.305986', 'step': 761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:37.336645', 'step': 761, 'epoch': 2} {'type': 'loss', 'content': 0.06608807295560837, 'timestamp': '2025-09-15 03:16:37.341189', 'step': 762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:37.370848', 'step': 762, 'epoch': 2} {'type': 'loss', 'content': 0.0007567157153971493, 'timestamp': '2025-09-15 03:16:37.373388', 'step': 763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:37.403097', 'step': 763, 'epoch': 2} {'type': 'loss', 'content': 0.0014279234455898404, 'timestamp': '2025-09-15 03:16:37.426664', 'step': 764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:37.458231', 'step': 764, 'epoch': 2} {'type': 'loss', 'content': 0.01821976900100708, 'timestamp': '2025-09-15 03:16:37.463548', 'step': 765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:37.493815', 'step': 765, 'epoch': 2} {'type': 'loss', 'content': 0.0018954897532239556, 'timestamp': '2025-09-15 03:16:37.495984', 'step': 766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:37.527727', 'step': 766, 'epoch': 2} {'type': 'loss', 'content': 0.0035364211071282625, 'timestamp': '2025-09-15 03:16:37.529820', 'step': 767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:37.559701', 'step': 767, 'epoch': 2} {'type': 'loss', 'content': 0.028634265065193176, 'timestamp': '2025-09-15 03:16:37.583279', 'step': 768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:37.612862', 'step': 768, 'epoch': 2} {'type': 'loss', 'content': 0.011222733184695244, 'timestamp': '2025-09-15 03:16:37.614870', 'step': 769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:37.645347', 'step': 769, 'epoch': 2} {'type': 'loss', 'content': 0.004136077128350735, 'timestamp': '2025-09-15 03:16:37.647414', 'step': 770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:37.677605', 'step': 770, 'epoch': 2} {'type': 'loss', 'content': 0.005994807463139296, 'timestamp': '2025-09-15 03:16:37.681907', 'step': 771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:37.712243', 'step': 771, 'epoch': 2} {'type': 'loss', 'content': 0.014044421724975109, 'timestamp': '2025-09-15 03:16:37.737562', 'step': 772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:37.767627', 'step': 772, 'epoch': 2} {'type': 'loss', 'content': 0.044583700597286224, 'timestamp': '2025-09-15 03:16:37.769968', 'step': 773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:37.799796', 'step': 773, 'epoch': 2} {'type': 'loss', 'content': 0.0017988347681239247, 'timestamp': '2025-09-15 03:16:37.801741', 'step': 774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:37.833531', 'step': 774, 'epoch': 2} {'type': 'loss', 'content': 0.02564374729990959, 'timestamp': '2025-09-15 03:16:37.835743', 'step': 775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:37.865814', 'step': 775, 'epoch': 2} {'type': 'loss', 'content': 0.01314005721360445, 'timestamp': '2025-09-15 03:16:37.889695', 'step': 776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:37.920049', 'step': 776, 'epoch': 2} {'type': 'loss', 'content': 0.0391593798995018, 'timestamp': '2025-09-15 03:16:37.922170', 'step': 777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:37.951649', 'step': 777, 'epoch': 2} {'type': 'loss', 'content': 0.008872945792973042, 'timestamp': '2025-09-15 03:16:37.956011', 'step': 778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:37.986077', 'step': 778, 'epoch': 2} {'type': 'loss', 'content': 0.009254111908376217, 'timestamp': '2025-09-15 03:16:37.993209', 'step': 779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:38.024701', 'step': 779, 'epoch': 2} {'type': 'loss', 'content': 0.001388642704114318, 'timestamp': '2025-09-15 03:16:38.048402', 'step': 780, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:38.637769', 'step': 780, 'epoch': 2} {'type': 'pplx', 'content': 91475636.10912232, 'timestamp': '2025-09-15 03:16:38.639760', 'step': 780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:38.668757', 'step': 780, 'epoch': 2} {'type': 'loss', 'content': 0.05400308594107628, 'timestamp': '2025-09-15 03:16:38.670970', 'step': 781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:38.700864', 'step': 781, 'epoch': 2} {'type': 'loss', 'content': 0.029076367616653442, 'timestamp': '2025-09-15 03:16:38.704716', 'step': 782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:38.735483', 'step': 782, 'epoch': 2} {'type': 'loss', 'content': 0.015219086781144142, 'timestamp': '2025-09-15 03:16:38.742725', 'step': 783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:38.772989', 'step': 783, 'epoch': 2} {'type': 'loss', 'content': 0.01549097616225481, 'timestamp': '2025-09-15 03:16:38.798231', 'step': 784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:38.828882', 'step': 784, 'epoch': 2} {'type': 'loss', 'content': 0.006616808939725161, 'timestamp': '2025-09-15 03:16:38.830975', 'step': 785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:38.861612', 'step': 785, 'epoch': 2} {'type': 'loss', 'content': 0.02561446838080883, 'timestamp': '2025-09-15 03:16:38.869180', 'step': 786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:38.899001', 'step': 786, 'epoch': 2} {'type': 'loss', 'content': 0.002184271113947034, 'timestamp': '2025-09-15 03:16:38.903827', 'step': 787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:38.936189', 'step': 787, 'epoch': 2} {'type': 'loss', 'content': 0.004748955834656954, 'timestamp': '2025-09-15 03:16:38.959872', 'step': 788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:38.990262', 'step': 788, 'epoch': 2} {'type': 'loss', 'content': 0.015393979847431183, 'timestamp': '2025-09-15 03:16:38.992405', 'step': 789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.022756', 'step': 789, 'epoch': 2} {'type': 'loss', 'content': 0.003272005822509527, 'timestamp': '2025-09-15 03:16:39.027048', 'step': 790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:39.057665', 'step': 790, 'epoch': 2} {'type': 'loss', 'content': 0.0167753454297781, 'timestamp': '2025-09-15 03:16:39.060351', 'step': 791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:39.091407', 'step': 791, 'epoch': 2} {'type': 'loss', 'content': 0.004095498938113451, 'timestamp': '2025-09-15 03:16:39.120074', 'step': 792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.151834', 'step': 792, 'epoch': 2} {'type': 'loss', 'content': 0.007157167885452509, 'timestamp': '2025-09-15 03:16:39.154300', 'step': 793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:39.185580', 'step': 793, 'epoch': 2} {'type': 'loss', 'content': 0.02066117525100708, 'timestamp': '2025-09-15 03:16:39.187572', 'step': 794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:39.218973', 'step': 794, 'epoch': 2} {'type': 'loss', 'content': 0.011057278141379356, 'timestamp': '2025-09-15 03:16:39.226137', 'step': 795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.256789', 'step': 795, 'epoch': 2} {'type': 'loss', 'content': 0.017794836312532425, 'timestamp': '2025-09-15 03:16:39.282076', 'step': 796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:39.311934', 'step': 796, 'epoch': 2} {'type': 'loss', 'content': 0.044305894523859024, 'timestamp': '2025-09-15 03:16:39.314661', 'step': 797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:39.345231', 'step': 797, 'epoch': 2} {'type': 'loss', 'content': 0.010922898538410664, 'timestamp': '2025-09-15 03:16:39.349935', 'step': 798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:39.380088', 'step': 798, 'epoch': 2} {'type': 'loss', 'content': 0.0023753507994115353, 'timestamp': '2025-09-15 03:16:39.382828', 'step': 799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:39.412330', 'step': 799, 'epoch': 2} {'type': 'loss', 'content': 0.01779024675488472, 'timestamp': '2025-09-15 03:16:39.435893', 'step': 800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:39.466282', 'step': 800, 'epoch': 2} {'type': 'loss', 'content': 0.0019487850368022919, 'timestamp': '2025-09-15 03:16:39.468504', 'step': 801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.498708', 'step': 801, 'epoch': 2} {'type': 'loss', 'content': 0.004304631147533655, 'timestamp': '2025-09-15 03:16:39.502881', 'step': 802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:39.532939', 'step': 802, 'epoch': 2} {'type': 'loss', 'content': 0.007857972756028175, 'timestamp': '2025-09-15 03:16:39.534946', 'step': 803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.565837', 'step': 803, 'epoch': 2} {'type': 'loss', 'content': 0.020163623616099358, 'timestamp': '2025-09-15 03:16:39.590948', 'step': 804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:39.624856', 'step': 804, 'epoch': 2} {'type': 'loss', 'content': 0.0343315526843071, 'timestamp': '2025-09-15 03:16:39.630262', 'step': 805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:39.663557', 'step': 805, 'epoch': 2} {'type': 'loss', 'content': 0.0051241847686469555, 'timestamp': '2025-09-15 03:16:39.665993', 'step': 806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:39.696291', 'step': 806, 'epoch': 2} {'type': 'loss', 'content': 0.006621518172323704, 'timestamp': '2025-09-15 03:16:39.699258', 'step': 807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:39.728546', 'step': 807, 'epoch': 2} {'type': 'loss', 'content': 0.00918488297611475, 'timestamp': '2025-09-15 03:16:39.752161', 'step': 808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.782176', 'step': 808, 'epoch': 2} {'type': 'loss', 'content': 0.0167904794216156, 'timestamp': '2025-09-15 03:16:39.784151', 'step': 809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:39.813915', 'step': 809, 'epoch': 2} {'type': 'loss', 'content': 0.012763723731040955, 'timestamp': '2025-09-15 03:16:39.816189', 'step': 810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:39.848653', 'step': 810, 'epoch': 2} {'type': 'loss', 'content': 0.006828667130321264, 'timestamp': '2025-09-15 03:16:39.855543', 'step': 811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.885412', 'step': 811, 'epoch': 2} {'type': 'loss', 'content': 0.007466200739145279, 'timestamp': '2025-09-15 03:16:39.910767', 'step': 812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:39.940314', 'step': 812, 'epoch': 2} {'type': 'loss', 'content': 0.006770141888409853, 'timestamp': '2025-09-15 03:16:39.942277', 'step': 813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:39.973699', 'step': 813, 'epoch': 2} {'type': 'loss', 'content': 0.02318437024950981, 'timestamp': '2025-09-15 03:16:39.975640', 'step': 814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:40.005672', 'step': 814, 'epoch': 2} {'type': 'loss', 'content': 0.019127046689391136, 'timestamp': '2025-09-15 03:16:40.007885', 'step': 815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:40.038004', 'step': 815, 'epoch': 2} {'type': 'loss', 'content': 0.005318149924278259, 'timestamp': '2025-09-15 03:16:40.061953', 'step': 816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:40.092305', 'step': 816, 'epoch': 2} {'type': 'loss', 'content': 0.01984100416302681, 'timestamp': '2025-09-15 03:16:40.094341', 'step': 817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:40.124134', 'step': 817, 'epoch': 2} {'type': 'loss', 'content': 0.0027165242936462164, 'timestamp': '2025-09-15 03:16:40.126186', 'step': 818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:40.156080', 'step': 818, 'epoch': 2} {'type': 'loss', 'content': 0.01571919396519661, 'timestamp': '2025-09-15 03:16:40.160558', 'step': 819, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:40.747069', 'step': 819, 'epoch': 2} {'type': 'pplx', 'content': 84134180.46701647, 'timestamp': '2025-09-15 03:16:40.748913', 'step': 819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:40.777001', 'step': 819, 'epoch': 2} {'type': 'loss', 'content': 0.010039118118584156, 'timestamp': '2025-09-15 03:16:40.802576', 'step': 820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:40.832763', 'step': 820, 'epoch': 2} {'type': 'loss', 'content': 0.0398661270737648, 'timestamp': '2025-09-15 03:16:40.835113', 'step': 821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:40.865732', 'step': 821, 'epoch': 2} {'type': 'loss', 'content': 0.0018895015818998218, 'timestamp': '2025-09-15 03:16:40.872712', 'step': 822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:40.903265', 'step': 822, 'epoch': 2} {'type': 'loss', 'content': 0.0020522272679954767, 'timestamp': '2025-09-15 03:16:40.910259', 'step': 823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:40.941710', 'step': 823, 'epoch': 2} {'type': 'loss', 'content': 0.012462636455893517, 'timestamp': '2025-09-15 03:16:40.969691', 'step': 824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:41.000101', 'step': 824, 'epoch': 2} {'type': 'loss', 'content': 0.006202584598213434, 'timestamp': '2025-09-15 03:16:41.002186', 'step': 825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:41.032496', 'step': 825, 'epoch': 2} {'type': 'loss', 'content': 0.011295338161289692, 'timestamp': '2025-09-15 03:16:41.034785', 'step': 826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:41.065068', 'step': 826, 'epoch': 2} {'type': 'loss', 'content': 0.017415456473827362, 'timestamp': '2025-09-15 03:16:41.067053', 'step': 827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:41.097289', 'step': 827, 'epoch': 2} {'type': 'loss', 'content': 0.010938125662505627, 'timestamp': '2025-09-15 03:16:41.121007', 'step': 828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:41.151259', 'step': 828, 'epoch': 2} {'type': 'loss', 'content': 0.014517457224428654, 'timestamp': '2025-09-15 03:16:41.153430', 'step': 829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:41.183394', 'step': 829, 'epoch': 2} {'type': 'loss', 'content': 0.03378923982381821, 'timestamp': '2025-09-15 03:16:41.185440', 'step': 830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:41.216218', 'step': 830, 'epoch': 2} {'type': 'loss', 'content': 0.01933034136891365, 'timestamp': '2025-09-15 03:16:41.223339', 'step': 831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:41.254027', 'step': 831, 'epoch': 2} {'type': 'loss', 'content': 0.020713625475764275, 'timestamp': '2025-09-15 03:16:41.280342', 'step': 832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:41.313891', 'step': 832, 'epoch': 2} {'type': 'loss', 'content': 0.018430788069963455, 'timestamp': '2025-09-15 03:16:41.316133', 'step': 833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:41.347729', 'step': 833, 'epoch': 2} {'type': 'loss', 'content': 0.018924186006188393, 'timestamp': '2025-09-15 03:16:41.354328', 'step': 834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:41.384082', 'step': 834, 'epoch': 2} {'type': 'loss', 'content': 0.0052634854800999165, 'timestamp': '2025-09-15 03:16:41.386825', 'step': 835, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:41.418766', 'step': 835, 'epoch': 2} {'type': 'loss', 'content': 0.009086108766496181, 'timestamp': '2025-09-15 03:16:41.442440', 'step': 836, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:41.473513', 'step': 836, 'epoch': 2} {'type': 'loss', 'content': 0.020214611664414406, 'timestamp': '2025-09-15 03:16:41.475468', 'step': 837, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:41.505621', 'step': 837, 'epoch': 2} {'type': 'loss', 'content': 0.008888529613614082, 'timestamp': '2025-09-15 03:16:41.507402', 'step': 838, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:41.537502', 'step': 838, 'epoch': 2} {'type': 'loss', 'content': 0.0037065569777041674, 'timestamp': '2025-09-15 03:16:41.540440', 'step': 839, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:41.570216', 'step': 839, 'epoch': 2} {'type': 'loss', 'content': 0.0057419161312282085, 'timestamp': '2025-09-15 03:16:41.593850', 'step': 840, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:41.625229', 'step': 840, 'epoch': 2} {'type': 'loss', 'content': 0.0032529861200600863, 'timestamp': '2025-09-15 03:16:41.630766', 'step': 841, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:41.662044', 'step': 841, 'epoch': 2} {'type': 'loss', 'content': 0.00431677233427763, 'timestamp': '2025-09-15 03:16:41.666374', 'step': 842, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:41.697554', 'step': 842, 'epoch': 2} {'type': 'loss', 'content': 0.014750749804079533, 'timestamp': '2025-09-15 03:16:41.705250', 'step': 843, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:41.736256', 'step': 843, 'epoch': 2} {'type': 'loss', 'content': 0.01972171850502491, 'timestamp': '2025-09-15 03:16:41.761863', 'step': 844, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:41.793333', 'step': 844, 'epoch': 2} {'type': 'loss', 'content': 0.014479391276836395, 'timestamp': '2025-09-15 03:16:41.801125', 'step': 845, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:41.832344', 'step': 845, 'epoch': 2} {'type': 'loss', 'content': 0.01149661373347044, 'timestamp': '2025-09-15 03:16:41.839362', 'step': 846, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:41.869753', 'step': 846, 'epoch': 2} {'type': 'loss', 'content': 0.014005172066390514, 'timestamp': '2025-09-15 03:16:41.876593', 'step': 847, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:41.906550', 'step': 847, 'epoch': 2} {'type': 'loss', 'content': 0.027859285473823547, 'timestamp': '2025-09-15 03:16:41.930242', 'step': 848, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:41.962208', 'step': 848, 'epoch': 2} {'type': 'loss', 'content': 0.002458788687363267, 'timestamp': '2025-09-15 03:16:41.964298', 'step': 849, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:41.995211', 'step': 849, 'epoch': 2} {'type': 'loss', 'content': 0.03137155622243881, 'timestamp': '2025-09-15 03:16:41.999822', 'step': 850, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:42.030035', 'step': 850, 'epoch': 2} {'type': 'loss', 'content': 0.02159748785197735, 'timestamp': '2025-09-15 03:16:42.032302', 'step': 851, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:42.062775', 'step': 851, 'epoch': 2} {'type': 'loss', 'content': 0.0020572689827531576, 'timestamp': '2025-09-15 03:16:42.090949', 'step': 852, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:42.121291', 'step': 852, 'epoch': 2} {'type': 'loss', 'content': 0.012123345397412777, 'timestamp': '2025-09-15 03:16:42.123461', 'step': 853, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:42.153335', 'step': 853, 'epoch': 2} {'type': 'loss', 'content': 0.0106295645236969, 'timestamp': '2025-09-15 03:16:42.155664', 'step': 854, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:42.186675', 'step': 854, 'epoch': 2} {'type': 'loss', 'content': 0.003279120195657015, 'timestamp': '2025-09-15 03:16:42.194210', 'step': 855, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:42.224629', 'step': 855, 'epoch': 2} {'type': 'loss', 'content': 0.006891821976751089, 'timestamp': '2025-09-15 03:16:42.253404', 'step': 856, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:42.283132', 'step': 856, 'epoch': 2} {'type': 'loss', 'content': 0.02520199678838253, 'timestamp': '2025-09-15 03:16:42.285755', 'step': 857, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:42.316268', 'step': 857, 'epoch': 2} {'type': 'loss', 'content': 0.03084452822804451, 'timestamp': '2025-09-15 03:16:42.318793', 'step': 858, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:42.916288', 'step': 858, 'epoch': 2} {'type': 'pplx', 'content': 87225457.20443432, 'timestamp': '2025-09-15 03:16:42.918481', 'step': 858, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:42.947505', 'step': 858, 'epoch': 2} {'type': 'loss', 'content': 0.004284991882741451, 'timestamp': '2025-09-15 03:16:42.955023', 'step': 859, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:42.986798', 'step': 859, 'epoch': 2} {'type': 'loss', 'content': 0.03205358237028122, 'timestamp': '2025-09-15 03:16:43.012665', 'step': 860, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:43.042712', 'step': 860, 'epoch': 2} {'type': 'loss', 'content': 0.016096081584692, 'timestamp': '2025-09-15 03:16:43.045202', 'step': 861, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:43.075796', 'step': 861, 'epoch': 2} {'type': 'loss', 'content': 0.014634288847446442, 'timestamp': '2025-09-15 03:16:43.078808', 'step': 862, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:43.108878', 'step': 862, 'epoch': 2} {'type': 'loss', 'content': 0.016217602416872978, 'timestamp': '2025-09-15 03:16:43.113669', 'step': 863, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:43.144194', 'step': 863, 'epoch': 2} {'type': 'loss', 'content': 0.008187689818441868, 'timestamp': '2025-09-15 03:16:43.172385', 'step': 864, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:43.202443', 'step': 864, 'epoch': 2} {'type': 'loss', 'content': 0.005138729233294725, 'timestamp': '2025-09-15 03:16:43.204527', 'step': 865, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:43.234718', 'step': 865, 'epoch': 2} {'type': 'loss', 'content': 0.018294647336006165, 'timestamp': '2025-09-15 03:16:43.241946', 'step': 866, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:43.272979', 'step': 866, 'epoch': 2} {'type': 'loss', 'content': 0.010565018281340599, 'timestamp': '2025-09-15 03:16:43.277628', 'step': 867, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:43.307641', 'step': 867, 'epoch': 2} {'type': 'loss', 'content': 0.004669521003961563, 'timestamp': '2025-09-15 03:16:43.335792', 'step': 868, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:43.366278', 'step': 868, 'epoch': 2} {'type': 'loss', 'content': 0.005344860255718231, 'timestamp': '2025-09-15 03:16:43.371627', 'step': 869, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:43.401841', 'step': 869, 'epoch': 2} {'type': 'loss', 'content': 0.015962885692715645, 'timestamp': '2025-09-15 03:16:43.406640', 'step': 870, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:43.436453', 'step': 870, 'epoch': 2} {'type': 'loss', 'content': 0.008724975399672985, 'timestamp': '2025-09-15 03:16:43.440979', 'step': 871, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:43.472403', 'step': 871, 'epoch': 2} {'type': 'loss', 'content': 0.0020475266501307487, 'timestamp': '2025-09-15 03:16:43.500064', 'step': 872, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:43.530208', 'step': 872, 'epoch': 2} {'type': 'loss', 'content': 0.00470705796033144, 'timestamp': '2025-09-15 03:16:43.532249', 'step': 873, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:43.562594', 'step': 873, 'epoch': 2} {'type': 'loss', 'content': 0.005053224973380566, 'timestamp': '2025-09-15 03:16:43.566313', 'step': 874, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:43.596323', 'step': 874, 'epoch': 2} {'type': 'loss', 'content': 0.009020118974149227, 'timestamp': '2025-09-15 03:16:43.598669', 'step': 875, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:43.629073', 'step': 875, 'epoch': 2} {'type': 'loss', 'content': 0.014806902967393398, 'timestamp': '2025-09-15 03:16:43.652929', 'step': 876, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:43.682761', 'step': 876, 'epoch': 2} {'type': 'loss', 'content': 0.021488988772034645, 'timestamp': '2025-09-15 03:16:43.685056', 'step': 877, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:43.714718', 'step': 877, 'epoch': 2} {'type': 'loss', 'content': 0.018449898809194565, 'timestamp': '2025-09-15 03:16:43.716963', 'step': 878, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:43.746605', 'step': 878, 'epoch': 2} {'type': 'loss', 'content': 0.0025834415573626757, 'timestamp': '2025-09-15 03:16:43.748886', 'step': 879, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:43.779851', 'step': 879, 'epoch': 2} {'type': 'loss', 'content': 0.0028492652345448732, 'timestamp': '2025-09-15 03:16:43.808017', 'step': 880, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:43.839151', 'step': 880, 'epoch': 2} {'type': 'loss', 'content': 0.004528566263616085, 'timestamp': '2025-09-15 03:16:43.844761', 'step': 881, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:43.874807', 'step': 881, 'epoch': 2} {'type': 'loss', 'content': 0.004636577796190977, 'timestamp': '2025-09-15 03:16:43.876928', 'step': 882, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:43.907477', 'step': 882, 'epoch': 2} {'type': 'loss', 'content': 0.0033082941081374884, 'timestamp': '2025-09-15 03:16:43.914552', 'step': 883, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:43.944312', 'step': 883, 'epoch': 2} {'type': 'loss', 'content': 0.005281738005578518, 'timestamp': '2025-09-15 03:16:43.968222', 'step': 884, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:43.999714', 'step': 884, 'epoch': 2} {'type': 'loss', 'content': 0.004062555264681578, 'timestamp': '2025-09-15 03:16:44.002174', 'step': 885, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:44.032133', 'step': 885, 'epoch': 2} {'type': 'loss', 'content': 0.007704237941652536, 'timestamp': '2025-09-15 03:16:44.034499', 'step': 886, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:44.063953', 'step': 886, 'epoch': 2} {'type': 'loss', 'content': 0.025397544726729393, 'timestamp': '2025-09-15 03:16:44.066115', 'step': 887, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:44.096062', 'step': 887, 'epoch': 2} {'type': 'loss', 'content': 0.00595526909455657, 'timestamp': '2025-09-15 03:16:44.119988', 'step': 888, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:44.149986', 'step': 888, 'epoch': 2} {'type': 'loss', 'content': 0.013230021111667156, 'timestamp': '2025-09-15 03:16:44.152288', 'step': 889, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:44.182439', 'step': 889, 'epoch': 2} {'type': 'loss', 'content': 0.002272603800520301, 'timestamp': '2025-09-15 03:16:44.186721', 'step': 890, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:44.217183', 'step': 890, 'epoch': 2} {'type': 'loss', 'content': 0.005084930453449488, 'timestamp': '2025-09-15 03:16:44.219817', 'step': 891, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:44.249994', 'step': 891, 'epoch': 2} {'type': 'loss', 'content': 0.013971127569675446, 'timestamp': '2025-09-15 03:16:44.275538', 'step': 892, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:44.305812', 'step': 892, 'epoch': 2} {'type': 'loss', 'content': 0.001800144906155765, 'timestamp': '2025-09-15 03:16:44.307843', 'step': 893, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:44.337821', 'step': 893, 'epoch': 2} {'type': 'loss', 'content': 0.005557761527597904, 'timestamp': '2025-09-15 03:16:44.342537', 'step': 894, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:44.373589', 'step': 894, 'epoch': 2} {'type': 'loss', 'content': 0.014650067314505577, 'timestamp': '2025-09-15 03:16:44.380529', 'step': 895, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:44.411601', 'step': 895, 'epoch': 2} {'type': 'loss', 'content': 0.001939317211508751, 'timestamp': '2025-09-15 03:16:44.440192', 'step': 896, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:44.470858', 'step': 896, 'epoch': 2} {'type': 'loss', 'content': 0.0019788548815995455, 'timestamp': '2025-09-15 03:16:44.475506', 'step': 897, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:45.066861', 'step': 897, 'epoch': 2} {'type': 'pplx', 'content': 97664972.25220236, 'timestamp': '2025-09-15 03:16:45.068990', 'step': 897, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:45.097613', 'step': 897, 'epoch': 2} {'type': 'loss', 'content': 0.0019209941383451223, 'timestamp': '2025-09-15 03:16:45.099566', 'step': 898, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:45.130259', 'step': 898, 'epoch': 2} {'type': 'loss', 'content': 0.0007233588839881122, 'timestamp': '2025-09-15 03:16:45.140677', 'step': 899, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:45.171112', 'step': 899, 'epoch': 2} {'type': 'loss', 'content': 0.0020077857188880444, 'timestamp': '2025-09-15 03:16:45.195064', 'step': 900, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:45.225585', 'step': 900, 'epoch': 2} {'type': 'loss', 'content': 0.0025911724660545588, 'timestamp': '2025-09-15 03:16:45.230893', 'step': 901, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:45.260717', 'step': 901, 'epoch': 2} {'type': 'loss', 'content': 0.004694484639912844, 'timestamp': '2025-09-15 03:16:45.265555', 'step': 902, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:45.295991', 'step': 902, 'epoch': 2} {'type': 'loss', 'content': 0.0017370609566569328, 'timestamp': '2025-09-15 03:16:45.300372', 'step': 903, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:45.330404', 'step': 903, 'epoch': 2} {'type': 'loss', 'content': 0.003571173409000039, 'timestamp': '2025-09-15 03:16:45.356017', 'step': 904, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:45.386138', 'step': 904, 'epoch': 2} {'type': 'loss', 'content': 0.004272869322448969, 'timestamp': '2025-09-15 03:16:45.389629', 'step': 905, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:45.419997', 'step': 905, 'epoch': 2} {'type': 'loss', 'content': 0.003742960514500737, 'timestamp': '2025-09-15 03:16:45.427854', 'step': 906, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-15 03:16:45.458372', 'step': 906, 'epoch': 2} {'type': 'loss', 'content': 0.0016657983651384711, 'timestamp': '2025-09-15 03:16:45.470813', 'step': 907, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:45.501490', 'step': 907, 'epoch': 2} {'type': 'loss', 'content': 0.0016500052297487855, 'timestamp': '2025-09-15 03:16:45.527072', 'step': 908, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:45.556842', 'step': 908, 'epoch': 2} {'type': 'loss', 'content': 0.002180821727961302, 'timestamp': '2025-09-15 03:16:45.561938', 'step': 909, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:45.592053', 'step': 909, 'epoch': 2} {'type': 'loss', 'content': 0.0025096870958805084, 'timestamp': '2025-09-15 03:16:45.599135', 'step': 910, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:45.630471', 'step': 910, 'epoch': 2} {'type': 'loss', 'content': 0.002849652897566557, 'timestamp': '2025-09-15 03:16:45.635284', 'step': 911, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:45.670223', 'step': 911, 'epoch': 2} {'type': 'loss', 'content': 0.003814171999692917, 'timestamp': '2025-09-15 03:16:45.695612', 'step': 912, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:45.725912', 'step': 912, 'epoch': 2} {'type': 'loss', 'content': 0.00793201383203268, 'timestamp': '2025-09-15 03:16:45.728924', 'step': 913, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:45.759626', 'step': 913, 'epoch': 2} {'type': 'loss', 'content': 0.0019563245587050915, 'timestamp': '2025-09-15 03:16:45.762042', 'step': 914, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:45.792523', 'step': 914, 'epoch': 2} {'type': 'loss', 'content': 0.0011040839599445462, 'timestamp': '2025-09-15 03:16:45.795234', 'step': 915, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:45.825711', 'step': 915, 'epoch': 2} {'type': 'loss', 'content': 0.0028216461651027203, 'timestamp': '2025-09-15 03:16:45.849403', 'step': 916, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:45.881013', 'step': 916, 'epoch': 2} {'type': 'loss', 'content': 0.0007526809931732714, 'timestamp': '2025-09-15 03:16:45.883687', 'step': 917, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:45.914280', 'step': 917, 'epoch': 2} {'type': 'loss', 'content': 0.0008043412235565484, 'timestamp': '2025-09-15 03:16:45.921225', 'step': 918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:45.951448', 'step': 918, 'epoch': 2} {'type': 'loss', 'content': 0.001197122037410736, 'timestamp': '2025-09-15 03:16:45.953901', 'step': 919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:45.984553', 'step': 919, 'epoch': 2} {'type': 'loss', 'content': 0.0014595863176509738, 'timestamp': '2025-09-15 03:16:46.008093', 'step': 920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:46.038952', 'step': 920, 'epoch': 2} {'type': 'loss', 'content': 0.001012066495604813, 'timestamp': '2025-09-15 03:16:46.041717', 'step': 921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:46.071851', 'step': 921, 'epoch': 2} {'type': 'loss', 'content': 0.00035511283203959465, 'timestamp': '2025-09-15 03:16:46.076148', 'step': 922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:46.106451', 'step': 922, 'epoch': 2} {'type': 'loss', 'content': 0.000994206522591412, 'timestamp': '2025-09-15 03:16:46.113750', 'step': 923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:46.143823', 'step': 923, 'epoch': 2} {'type': 'loss', 'content': 0.0010711466893553734, 'timestamp': '2025-09-15 03:16:46.169656', 'step': 924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:46.199461', 'step': 924, 'epoch': 2} {'type': 'loss', 'content': 0.003031970700249076, 'timestamp': '2025-09-15 03:16:46.201635', 'step': 925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:46.231677', 'step': 925, 'epoch': 2} {'type': 'loss', 'content': 0.0010669993935152888, 'timestamp': '2025-09-15 03:16:46.236430', 'step': 926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:46.266552', 'step': 926, 'epoch': 2} {'type': 'loss', 'content': 0.004038740415126085, 'timestamp': '2025-09-15 03:16:46.269391', 'step': 927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:46.299597', 'step': 927, 'epoch': 2} {'type': 'loss', 'content': 0.00037929200334474444, 'timestamp': '2025-09-15 03:16:46.323281', 'step': 928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:46.353625', 'step': 928, 'epoch': 2} {'type': 'loss', 'content': 0.00020541806588880718, 'timestamp': '2025-09-15 03:16:46.358221', 'step': 929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:46.387974', 'step': 929, 'epoch': 2} {'type': 'loss', 'content': 0.0017460313392803073, 'timestamp': '2025-09-15 03:16:46.395060', 'step': 930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:46.425458', 'step': 930, 'epoch': 2} {'type': 'loss', 'content': 0.0005159589927643538, 'timestamp': '2025-09-15 03:16:46.432681', 'step': 931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:46.463015', 'step': 931, 'epoch': 2} {'type': 'loss', 'content': 0.0029993560165166855, 'timestamp': '2025-09-15 03:16:46.488289', 'step': 932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:46.518733', 'step': 932, 'epoch': 2} {'type': 'loss', 'content': 0.0013383693294599652, 'timestamp': '2025-09-15 03:16:46.523490', 'step': 933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:46.553508', 'step': 933, 'epoch': 2} {'type': 'loss', 'content': 0.0037266218569129705, 'timestamp': '2025-09-15 03:16:46.555643', 'step': 934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:46.586127', 'step': 934, 'epoch': 2} {'type': 'loss', 'content': 0.0014700050232931972, 'timestamp': '2025-09-15 03:16:46.588879', 'step': 935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:46.619999', 'step': 935, 'epoch': 2} {'type': 'loss', 'content': 0.0016449715476483107, 'timestamp': '2025-09-15 03:16:46.644009', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:47.230087', 'step': 936, 'epoch': 2} {'type': 'pplx', 'content': 117820981.15308204, 'timestamp': '2025-09-15 03:16:47.231764', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:47.259736', 'step': 936, 'epoch': 2} {'type': 'loss', 'content': 0.00042086487519554794, 'timestamp': '2025-09-15 03:16:47.261916', 'step': 937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:47.292556', 'step': 937, 'epoch': 2} {'type': 'loss', 'content': 0.0002839158405549824, 'timestamp': '2025-09-15 03:16:47.296766', 'step': 938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:47.327598', 'step': 938, 'epoch': 2} {'type': 'loss', 'content': 0.00030601557227782905, 'timestamp': '2025-09-15 03:16:47.332396', 'step': 939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:47.364109', 'step': 939, 'epoch': 2} {'type': 'loss', 'content': 0.00037333546788431704, 'timestamp': '2025-09-15 03:16:47.388182', 'step': 940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:47.419364', 'step': 940, 'epoch': 2} {'type': 'loss', 'content': 0.0013638328528031707, 'timestamp': '2025-09-15 03:16:47.421913', 'step': 941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:47.453328', 'step': 941, 'epoch': 2} {'type': 'loss', 'content': 0.0005319089395925403, 'timestamp': '2025-09-15 03:16:47.455415', 'step': 942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:47.485061', 'step': 942, 'epoch': 2} {'type': 'loss', 'content': 0.00039939445559866726, 'timestamp': '2025-09-15 03:16:47.487018', 'step': 943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:47.516953', 'step': 943, 'epoch': 2} {'type': 'loss', 'content': 0.002649063942953944, 'timestamp': '2025-09-15 03:16:47.540563', 'step': 944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:47.571323', 'step': 944, 'epoch': 2} {'type': 'loss', 'content': 0.0003652539453469217, 'timestamp': '2025-09-15 03:16:47.576702', 'step': 945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:47.606794', 'step': 945, 'epoch': 2} {'type': 'loss', 'content': 0.0002454536152072251, 'timestamp': '2025-09-15 03:16:47.611600', 'step': 946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:47.641913', 'step': 946, 'epoch': 2} {'type': 'loss', 'content': 0.0004982949467375875, 'timestamp': '2025-09-15 03:16:47.646251', 'step': 947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:47.676735', 'step': 947, 'epoch': 2} {'type': 'loss', 'content': 0.0007253338699229062, 'timestamp': '2025-09-15 03:16:47.702580', 'step': 948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:47.732998', 'step': 948, 'epoch': 2} {'type': 'loss', 'content': 0.001324442564509809, 'timestamp': '2025-09-15 03:16:47.735102', 'step': 949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:47.764092', 'step': 949, 'epoch': 2} {'type': 'loss', 'content': 0.0002965231833513826, 'timestamp': '2025-09-15 03:16:47.766287', 'step': 950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:47.796296', 'step': 950, 'epoch': 2} {'type': 'loss', 'content': 0.003919391892850399, 'timestamp': '2025-09-15 03:16:47.798768', 'step': 951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:47.829520', 'step': 951, 'epoch': 2} {'type': 'loss', 'content': 0.0001225820742547512, 'timestamp': '2025-09-15 03:16:47.858087', 'step': 952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:47.887663', 'step': 952, 'epoch': 2} {'type': 'loss', 'content': 0.00019038261962123215, 'timestamp': '2025-09-15 03:16:47.890124', 'step': 953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:47.920165', 'step': 953, 'epoch': 2} {'type': 'loss', 'content': 0.0002864906855393201, 'timestamp': '2025-09-15 03:16:47.922554', 'step': 954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:47.952164', 'step': 954, 'epoch': 2} {'type': 'loss', 'content': 0.0006765606231056154, 'timestamp': '2025-09-15 03:16:47.954372', 'step': 955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:47.985059', 'step': 955, 'epoch': 2} {'type': 'loss', 'content': 0.00018442471628077328, 'timestamp': '2025-09-15 03:16:48.013313', 'step': 956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:48.043057', 'step': 956, 'epoch': 2} {'type': 'loss', 'content': 0.003149544121697545, 'timestamp': '2025-09-15 03:16:48.045009', 'step': 957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:48.074542', 'step': 957, 'epoch': 2} {'type': 'loss', 'content': 0.0019628521986305714, 'timestamp': '2025-09-15 03:16:48.076662', 'step': 958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:48.106578', 'step': 958, 'epoch': 2} {'type': 'loss', 'content': 0.0011997123947367072, 'timestamp': '2025-09-15 03:16:48.108751', 'step': 959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:48.138763', 'step': 959, 'epoch': 2} {'type': 'loss', 'content': 0.002039334736764431, 'timestamp': '2025-09-15 03:16:48.162668', 'step': 960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:48.192851', 'step': 960, 'epoch': 2} {'type': 'loss', 'content': 8.800149953458458e-05, 'timestamp': '2025-09-15 03:16:48.195378', 'step': 961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:48.225502', 'step': 961, 'epoch': 2} {'type': 'loss', 'content': 0.0015280414372682571, 'timestamp': '2025-09-15 03:16:48.227626', 'step': 962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:48.257614', 'step': 962, 'epoch': 2} {'type': 'loss', 'content': 0.00011810220166807994, 'timestamp': '2025-09-15 03:16:48.260475', 'step': 963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:48.290224', 'step': 963, 'epoch': 2} {'type': 'loss', 'content': 0.019818326458334923, 'timestamp': '2025-09-15 03:16:48.318361', 'step': 964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:48.347748', 'step': 964, 'epoch': 2} {'type': 'loss', 'content': 0.0020504810381680727, 'timestamp': '2025-09-15 03:16:48.349696', 'step': 965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:48.380277', 'step': 965, 'epoch': 2} {'type': 'loss', 'content': 0.0007288079359568655, 'timestamp': '2025-09-15 03:16:48.384560', 'step': 966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:48.414652', 'step': 966, 'epoch': 2} {'type': 'loss', 'content': 0.0011788300471380353, 'timestamp': '2025-09-15 03:16:48.417759', 'step': 967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:48.454870', 'step': 967, 'epoch': 2} {'type': 'loss', 'content': 0.0001325029879808426, 'timestamp': '2025-09-15 03:16:48.483130', 'step': 968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:48.513336', 'step': 968, 'epoch': 2} {'type': 'loss', 'content': 0.0010475792223587632, 'timestamp': '2025-09-15 03:16:48.515610', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:48.545754', 'step': 969, 'epoch': 2} {'type': 'loss', 'content': 0.0006575215957127512, 'timestamp': '2025-09-15 03:16:48.548624', 'step': 970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:48.578607', 'step': 970, 'epoch': 2} {'type': 'loss', 'content': 7.66265657148324e-05, 'timestamp': '2025-09-15 03:16:48.583485', 'step': 971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:48.613311', 'step': 971, 'epoch': 2} {'type': 'loss', 'content': 0.008739389479160309, 'timestamp': '2025-09-15 03:16:48.636856', 'step': 972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:48.668338', 'step': 972, 'epoch': 2} {'type': 'loss', 'content': 0.00012891367077827454, 'timestamp': '2025-09-15 03:16:48.673295', 'step': 973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:48.703191', 'step': 973, 'epoch': 2} {'type': 'loss', 'content': 9.70660476014018e-05, 'timestamp': '2025-09-15 03:16:48.707453', 'step': 974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:48.737410', 'step': 974, 'epoch': 2} {'type': 'loss', 'content': 8.728348620934412e-05, 'timestamp': '2025-09-15 03:16:48.744365', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:49.345637', 'step': 975, 'epoch': 2} {'type': 'pplx', 'content': 130597413.28941588, 'timestamp': '2025-09-15 03:16:49.350763', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:49.379334', 'step': 975, 'epoch': 2} {'type': 'loss', 'content': 0.00420480826869607, 'timestamp': '2025-09-15 03:16:49.403089', 'step': 976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:49.433144', 'step': 976, 'epoch': 2} {'type': 'loss', 'content': 0.032693080604076385, 'timestamp': '2025-09-15 03:16:49.435199', 'step': 977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:49.465452', 'step': 977, 'epoch': 2} {'type': 'loss', 'content': 0.0009658890776336193, 'timestamp': '2025-09-15 03:16:49.467727', 'step': 978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:49.497553', 'step': 978, 'epoch': 2} {'type': 'loss', 'content': 0.0034255923237651587, 'timestamp': '2025-09-15 03:16:49.504999', 'step': 979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:49.535164', 'step': 979, 'epoch': 2} {'type': 'loss', 'content': 0.0006966963992454112, 'timestamp': '2025-09-15 03:16:49.559057', 'step': 980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:49.590644', 'step': 980, 'epoch': 2} {'type': 'loss', 'content': 0.034857045859098434, 'timestamp': '2025-09-15 03:16:49.593073', 'step': 981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:49.625258', 'step': 981, 'epoch': 2} {'type': 'loss', 'content': 0.0002292133867740631, 'timestamp': '2025-09-15 03:16:49.627739', 'step': 982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:16:49.658448', 'step': 982, 'epoch': 2} {'type': 'loss', 'content': 8.10977871879004e-05, 'timestamp': '2025-09-15 03:16:49.666318', 'step': 983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:49.696602', 'step': 983, 'epoch': 2} {'type': 'loss', 'content': 0.006245698779821396, 'timestamp': '2025-09-15 03:16:49.720473', 'step': 984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:49.751066', 'step': 984, 'epoch': 2} {'type': 'loss', 'content': 0.0010302267037332058, 'timestamp': '2025-09-15 03:16:49.753521', 'step': 985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:49.783624', 'step': 985, 'epoch': 2} {'type': 'loss', 'content': 0.0007164975395426154, 'timestamp': '2025-09-15 03:16:49.787905', 'step': 986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:49.817434', 'step': 986, 'epoch': 2} {'type': 'loss', 'content': 0.0336216501891613, 'timestamp': '2025-09-15 03:16:49.819809', 'step': 987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:49.850210', 'step': 987, 'epoch': 2} {'type': 'loss', 'content': 0.0003482494503259659, 'timestamp': '2025-09-15 03:16:49.873728', 'step': 988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:49.904007', 'step': 988, 'epoch': 2} {'type': 'loss', 'content': 0.0004796452703885734, 'timestamp': '2025-09-15 03:16:49.906197', 'step': 989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:49.936436', 'step': 989, 'epoch': 2} {'type': 'loss', 'content': 0.00031821359880268574, 'timestamp': '2025-09-15 03:16:49.938832', 'step': 990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:49.969477', 'step': 990, 'epoch': 2} {'type': 'loss', 'content': 0.010607955977320671, 'timestamp': '2025-09-15 03:16:49.974161', 'step': 991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:50.004515', 'step': 991, 'epoch': 2} {'type': 'loss', 'content': 0.0005565133760683239, 'timestamp': '2025-09-15 03:16:50.028045', 'step': 992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:50.059228', 'step': 992, 'epoch': 2} {'type': 'loss', 'content': 0.005336018744856119, 'timestamp': '2025-09-15 03:16:50.061416', 'step': 993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:50.092213', 'step': 993, 'epoch': 2} {'type': 'loss', 'content': 0.002163677243515849, 'timestamp': '2025-09-15 03:16:50.096736', 'step': 994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:50.127355', 'step': 994, 'epoch': 2} {'type': 'loss', 'content': 0.008492656983435154, 'timestamp': '2025-09-15 03:16:50.129744', 'step': 995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:50.160754', 'step': 995, 'epoch': 2} {'type': 'loss', 'content': 0.004409152548760176, 'timestamp': '2025-09-15 03:16:50.184430', 'step': 996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:50.214949', 'step': 996, 'epoch': 2} {'type': 'loss', 'content': 0.04378414899110794, 'timestamp': '2025-09-15 03:16:50.216905', 'step': 997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:50.247859', 'step': 997, 'epoch': 2} {'type': 'loss', 'content': 0.011165916919708252, 'timestamp': '2025-09-15 03:16:50.252116', 'step': 998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:50.283378', 'step': 998, 'epoch': 2} {'type': 'loss', 'content': 0.00016595398483332247, 'timestamp': '2025-09-15 03:16:50.289967', 'step': 999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:50.321182', 'step': 999, 'epoch': 2} {'type': 'loss', 'content': 0.0020952578634023666, 'timestamp': '2025-09-15 03:16:50.346121', 'step': 1000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-15 03:16:57.070929', 'step': 1000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:57.133918', 'step': 1000, 'epoch': 2} {'type': 'loss', 'content': 0.00026529579190537333, 'timestamp': '2025-09-15 03:16:57.137117', 'step': 1001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:57.169836', 'step': 1001, 'epoch': 2} {'type': 'loss', 'content': 0.0021564450580626726, 'timestamp': '2025-09-15 03:16:57.173725', 'step': 1002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:57.204137', 'step': 1002, 'epoch': 2} {'type': 'loss', 'content': 9.368791506858543e-05, 'timestamp': '2025-09-15 03:16:57.206838', 'step': 1003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:57.237714', 'step': 1003, 'epoch': 2} {'type': 'loss', 'content': 0.0009809520561248064, 'timestamp': '2025-09-15 03:16:57.262247', 'step': 1004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:57.293076', 'step': 1004, 'epoch': 2} {'type': 'loss', 'content': 0.003123172326013446, 'timestamp': '2025-09-15 03:16:57.297136', 'step': 1005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:57.327447', 'step': 1005, 'epoch': 2} {'type': 'loss', 'content': 0.002402219222858548, 'timestamp': '2025-09-15 03:16:57.334030', 'step': 1006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:57.364845', 'step': 1006, 'epoch': 2} {'type': 'loss', 'content': 0.00017964097787626088, 'timestamp': '2025-09-15 03:16:57.371356', 'step': 1007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:57.403222', 'step': 1007, 'epoch': 2} {'type': 'loss', 'content': 0.0009959181770682335, 'timestamp': '2025-09-15 03:16:57.427266', 'step': 1008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:57.458850', 'step': 1008, 'epoch': 2} {'type': 'loss', 'content': 0.013769718818366528, 'timestamp': '2025-09-15 03:16:57.463508', 'step': 1009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:57.494685', 'step': 1009, 'epoch': 2} {'type': 'loss', 'content': 0.005754461046308279, 'timestamp': '2025-09-15 03:16:57.496945', 'step': 1010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:57.527905', 'step': 1010, 'epoch': 2} {'type': 'loss', 'content': 0.013897545635700226, 'timestamp': '2025-09-15 03:16:57.529969', 'step': 1011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:57.560739', 'step': 1011, 'epoch': 2} {'type': 'loss', 'content': 0.0009429440833628178, 'timestamp': '2025-09-15 03:16:57.584626', 'step': 1012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:57.616118', 'step': 1012, 'epoch': 2} {'type': 'loss', 'content': 0.004884080495685339, 'timestamp': '2025-09-15 03:16:57.620063', 'step': 1013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:57.652340', 'step': 1013, 'epoch': 2} {'type': 'loss', 'content': 0.00030809929012320936, 'timestamp': '2025-09-15 03:16:57.654556', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:16:58.246830', 'step': 1014, 'epoch': 2} {'type': 'pplx', 'content': 133013128.32087575, 'timestamp': '2025-09-15 03:16:58.248804', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:58.277860', 'step': 1014, 'epoch': 2} {'type': 'loss', 'content': 0.02146388776600361, 'timestamp': '2025-09-15 03:16:58.280024', 'step': 1015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:16:58.311708', 'step': 1015, 'epoch': 2} {'type': 'loss', 'content': 0.00024146329087670892, 'timestamp': '2025-09-15 03:16:58.338981', 'step': 1016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:58.369735', 'step': 1016, 'epoch': 2} {'type': 'loss', 'content': 0.0033533659297972918, 'timestamp': '2025-09-15 03:16:58.373452', 'step': 1017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:58.406216', 'step': 1017, 'epoch': 2} {'type': 'loss', 'content': 0.0004804394266102463, 'timestamp': '2025-09-15 03:16:58.418074', 'step': 1018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:58.453288', 'step': 1018, 'epoch': 2} {'type': 'loss', 'content': 9.127042721956968e-05, 'timestamp': '2025-09-15 03:16:58.462898', 'step': 1019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:58.495023', 'step': 1019, 'epoch': 2} {'type': 'loss', 'content': 0.005294309463351965, 'timestamp': '2025-09-15 03:16:58.519323', 'step': 1020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:58.553035', 'step': 1020, 'epoch': 2} {'type': 'loss', 'content': 0.034268833696842194, 'timestamp': '2025-09-15 03:16:58.557124', 'step': 1021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:58.591270', 'step': 1021, 'epoch': 2} {'type': 'loss', 'content': 0.0009754736674949527, 'timestamp': '2025-09-15 03:16:58.605187', 'step': 1022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:58.639155', 'step': 1022, 'epoch': 2} {'type': 'loss', 'content': 0.001966263633221388, 'timestamp': '2025-09-15 03:16:58.642459', 'step': 1023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:58.673939', 'step': 1023, 'epoch': 2} {'type': 'loss', 'content': 0.021047789603471756, 'timestamp': '2025-09-15 03:16:58.699063', 'step': 1024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:58.731595', 'step': 1024, 'epoch': 2} {'type': 'loss', 'content': 0.013026480562984943, 'timestamp': '2025-09-15 03:16:58.736599', 'step': 1025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:58.767626', 'step': 1025, 'epoch': 2} {'type': 'loss', 'content': 0.006008670665323734, 'timestamp': '2025-09-15 03:16:58.774634', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:58.805623', 'step': 1026, 'epoch': 2} {'type': 'loss', 'content': 0.0004582749679684639, 'timestamp': '2025-09-15 03:16:58.807842', 'step': 1027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:16:58.838737', 'step': 1027, 'epoch': 2} {'type': 'loss', 'content': 0.006711532361805439, 'timestamp': '2025-09-15 03:16:58.869644', 'step': 1028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:58.902819', 'step': 1028, 'epoch': 2} {'type': 'loss', 'content': 0.0003885440528392792, 'timestamp': '2025-09-15 03:16:58.914280', 'step': 1029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:58.959725', 'step': 1029, 'epoch': 2} {'type': 'loss', 'content': 0.0003321361436974257, 'timestamp': '2025-09-15 03:16:58.977293', 'step': 1030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.020648', 'step': 1030, 'epoch': 2} {'type': 'loss', 'content': 0.0006997405434958637, 'timestamp': '2025-09-15 03:16:59.034078', 'step': 1031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.080505', 'step': 1031, 'epoch': 2} {'type': 'loss', 'content': 0.00020196476543787867, 'timestamp': '2025-09-15 03:16:59.113059', 'step': 1032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.146827', 'step': 1032, 'epoch': 2} {'type': 'loss', 'content': 0.0006217014160938561, 'timestamp': '2025-09-15 03:16:59.150555', 'step': 1033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:59.182179', 'step': 1033, 'epoch': 2} {'type': 'loss', 'content': 0.00045512375072576106, 'timestamp': '2025-09-15 03:16:59.185915', 'step': 1034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.216866', 'step': 1034, 'epoch': 2} {'type': 'loss', 'content': 0.0001574978668941185, 'timestamp': '2025-09-15 03:16:59.219089', 'step': 1035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:59.250202', 'step': 1035, 'epoch': 2} {'type': 'loss', 'content': 0.0007082565571181476, 'timestamp': '2025-09-15 03:16:59.274443', 'step': 1036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:16:59.305602', 'step': 1036, 'epoch': 2} {'type': 'loss', 'content': 0.0003342053678352386, 'timestamp': '2025-09-15 03:16:59.308300', 'step': 1037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:16:59.339682', 'step': 1037, 'epoch': 2} {'type': 'loss', 'content': 0.0035582254640758038, 'timestamp': '2025-09-15 03:16:59.346312', 'step': 1038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:16:59.377184', 'step': 1038, 'epoch': 2} {'type': 'loss', 'content': 0.0013459293404594064, 'timestamp': '2025-09-15 03:16:59.379689', 'step': 1039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:59.411229', 'step': 1039, 'epoch': 2} {'type': 'loss', 'content': 0.0205552838742733, 'timestamp': '2025-09-15 03:16:59.439234', 'step': 1040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:59.470110', 'step': 1040, 'epoch': 2} {'type': 'loss', 'content': 0.007523669395595789, 'timestamp': '2025-09-15 03:16:59.472328', 'step': 1041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-15 03:16:59.505554', 'step': 1041, 'epoch': 2} {'type': 'loss', 'content': 0.00014045715215615928, 'timestamp': '2025-09-15 03:16:59.518990', 'step': 1042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.556205', 'step': 1042, 'epoch': 2} {'type': 'loss', 'content': 0.006666823290288448, 'timestamp': '2025-09-15 03:16:59.558569', 'step': 1043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:16:59.589159', 'step': 1043, 'epoch': 2} {'type': 'loss', 'content': 0.0002057207893813029, 'timestamp': '2025-09-15 03:16:59.621802', 'step': 1044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:16:59.668732', 'step': 1044, 'epoch': 2} {'type': 'loss', 'content': 0.004083287436515093, 'timestamp': '2025-09-15 03:16:59.686645', 'step': 1045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:16:59.731960', 'step': 1045, 'epoch': 2} {'type': 'loss', 'content': 0.004076201934367418, 'timestamp': '2025-09-15 03:16:59.745434', 'step': 1046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.780555', 'step': 1046, 'epoch': 2} {'type': 'loss', 'content': 0.00010849825775949284, 'timestamp': '2025-09-15 03:16:59.782985', 'step': 1047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:59.814671', 'step': 1047, 'epoch': 2} {'type': 'loss', 'content': 0.00018463960441295058, 'timestamp': '2025-09-15 03:16:59.838427', 'step': 1048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:16:59.869922', 'step': 1048, 'epoch': 2} {'type': 'loss', 'content': 9.18033329071477e-05, 'timestamp': '2025-09-15 03:16:59.872883', 'step': 1049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:59.904326', 'step': 1049, 'epoch': 2} {'type': 'loss', 'content': 0.0013023499632254243, 'timestamp': '2025-09-15 03:16:59.908682', 'step': 1050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:16:59.942538', 'step': 1050, 'epoch': 2} {'type': 'loss', 'content': 0.00042243319330736995, 'timestamp': '2025-09-15 03:16:59.945859', 'step': 1051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:16:59.975909', 'step': 1051, 'epoch': 2} {'type': 'loss', 'content': 0.0019100798526778817, 'timestamp': '2025-09-15 03:17:00.002158', 'step': 1052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:00.042626', 'step': 1052, 'epoch': 2} {'type': 'loss', 'content': 0.004875184502452612, 'timestamp': '2025-09-15 03:17:00.053457', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:00.641377', 'step': 1053, 'epoch': 2} {'type': 'pplx', 'content': 128433445.53436927, 'timestamp': '2025-09-15 03:17:00.643332', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:00.672739', 'step': 1053, 'epoch': 2} {'type': 'loss', 'content': 0.00015715994231868535, 'timestamp': '2025-09-15 03:17:00.679798', 'step': 1054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:00.710352', 'step': 1054, 'epoch': 2} {'type': 'loss', 'content': 0.0015639655757695436, 'timestamp': '2025-09-15 03:17:00.712423', 'step': 1055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:00.742595', 'step': 1055, 'epoch': 2} {'type': 'loss', 'content': 0.001694310107268393, 'timestamp': '2025-09-15 03:17:00.770164', 'step': 1056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:00.801004', 'step': 1056, 'epoch': 2} {'type': 'loss', 'content': 0.00971940066665411, 'timestamp': '2025-09-15 03:17:00.803232', 'step': 1057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:00.833076', 'step': 1057, 'epoch': 2} {'type': 'loss', 'content': 0.010265431366860867, 'timestamp': '2025-09-15 03:17:00.835373', 'step': 1058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:00.865884', 'step': 1058, 'epoch': 2} {'type': 'loss', 'content': 0.001557617331854999, 'timestamp': '2025-09-15 03:17:00.873290', 'step': 1059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:00.904138', 'step': 1059, 'epoch': 2} {'type': 'loss', 'content': 0.0001345228374702856, 'timestamp': '2025-09-15 03:17:00.927733', 'step': 1060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:00.959287', 'step': 1060, 'epoch': 2} {'type': 'loss', 'content': 0.0007394430576823652, 'timestamp': '2025-09-15 03:17:00.963858', 'step': 1061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:00.994856', 'step': 1061, 'epoch': 2} {'type': 'loss', 'content': 0.0004073964955750853, 'timestamp': '2025-09-15 03:17:00.997074', 'step': 1062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:01.028350', 'step': 1062, 'epoch': 2} {'type': 'loss', 'content': 0.004258634988218546, 'timestamp': '2025-09-15 03:17:01.030874', 'step': 1063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:01.061520', 'step': 1063, 'epoch': 2} {'type': 'loss', 'content': 0.0001565528364153579, 'timestamp': '2025-09-15 03:17:01.089055', 'step': 1064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:01.120900', 'step': 1064, 'epoch': 2} {'type': 'loss', 'content': 0.01607772521674633, 'timestamp': '2025-09-15 03:17:01.123003', 'step': 1065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:01.154524', 'step': 1065, 'epoch': 2} {'type': 'loss', 'content': 0.00016873667482286692, 'timestamp': '2025-09-15 03:17:01.161852', 'step': 1066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.193229', 'step': 1066, 'epoch': 2} {'type': 'loss', 'content': 0.00010801710595842451, 'timestamp': '2025-09-15 03:17:01.195342', 'step': 1067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.226064', 'step': 1067, 'epoch': 2} {'type': 'loss', 'content': 6.337455852190033e-05, 'timestamp': '2025-09-15 03:17:01.249880', 'step': 1068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.280909', 'step': 1068, 'epoch': 2} {'type': 'loss', 'content': 7.294497481780127e-05, 'timestamp': '2025-09-15 03:17:01.283131', 'step': 1069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:01.314083', 'step': 1069, 'epoch': 2} {'type': 'loss', 'content': 0.00018661771900951862, 'timestamp': '2025-09-15 03:17:01.316371', 'step': 1070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:01.347410', 'step': 1070, 'epoch': 2} {'type': 'loss', 'content': 7.31277177692391e-05, 'timestamp': '2025-09-15 03:17:01.349480', 'step': 1071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:01.379818', 'step': 1071, 'epoch': 2} {'type': 'loss', 'content': 0.003055393695831299, 'timestamp': '2025-09-15 03:17:01.403502', 'step': 1072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:01.434488', 'step': 1072, 'epoch': 2} {'type': 'loss', 'content': 0.0011397640919312835, 'timestamp': '2025-09-15 03:17:01.436578', 'step': 1073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:01.466626', 'step': 1073, 'epoch': 2} {'type': 'loss', 'content': 0.00018760105012916028, 'timestamp': '2025-09-15 03:17:01.469116', 'step': 1074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.499841', 'step': 1074, 'epoch': 2} {'type': 'loss', 'content': 0.03531399741768837, 'timestamp': '2025-09-15 03:17:01.502125', 'step': 1075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:01.533233', 'step': 1075, 'epoch': 2} {'type': 'loss', 'content': 0.0020788447000086308, 'timestamp': '2025-09-15 03:17:01.558353', 'step': 1076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.589433', 'step': 1076, 'epoch': 2} {'type': 'loss', 'content': 0.006431459914892912, 'timestamp': '2025-09-15 03:17:01.591636', 'step': 1077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:01.622417', 'step': 1077, 'epoch': 2} {'type': 'loss', 'content': 0.00033594778506085277, 'timestamp': '2025-09-15 03:17:01.626190', 'step': 1078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:01.656675', 'step': 1078, 'epoch': 2} {'type': 'loss', 'content': 0.000293986959150061, 'timestamp': '2025-09-15 03:17:01.663368', 'step': 1079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:01.694947', 'step': 1079, 'epoch': 2} {'type': 'loss', 'content': 6.315172504400834e-05, 'timestamp': '2025-09-15 03:17:01.725903', 'step': 1080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:01.756602', 'step': 1080, 'epoch': 2} {'type': 'loss', 'content': 0.0007892033900134265, 'timestamp': '2025-09-15 03:17:01.761025', 'step': 1081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.791155', 'step': 1081, 'epoch': 2} {'type': 'loss', 'content': 0.0016475154552608728, 'timestamp': '2025-09-15 03:17:01.793188', 'step': 1082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:01.823978', 'step': 1082, 'epoch': 2} {'type': 'loss', 'content': 0.00014074041973799467, 'timestamp': '2025-09-15 03:17:01.825973', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:01.857526', 'step': 1083, 'epoch': 2} {'type': 'loss', 'content': 0.010327513329684734, 'timestamp': '2025-09-15 03:17:01.881062', 'step': 1084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:01.912737', 'step': 1084, 'epoch': 2} {'type': 'loss', 'content': 0.03140605241060257, 'timestamp': '2025-09-15 03:17:01.914853', 'step': 1085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:01.945337', 'step': 1085, 'epoch': 2} {'type': 'loss', 'content': 0.00010317912528989837, 'timestamp': '2025-09-15 03:17:01.947950', 'step': 1086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:01.978531', 'step': 1086, 'epoch': 2} {'type': 'loss', 'content': 0.00036211867700330913, 'timestamp': '2025-09-15 03:17:01.981393', 'step': 1087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:02.011645', 'step': 1087, 'epoch': 2} {'type': 'loss', 'content': 0.0008095457451418042, 'timestamp': '2025-09-15 03:17:02.036809', 'step': 1088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:02.067225', 'step': 1088, 'epoch': 2} {'type': 'loss', 'content': 0.0014814898604527116, 'timestamp': '2025-09-15 03:17:02.069343', 'step': 1089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:02.100634', 'step': 1089, 'epoch': 2} {'type': 'loss', 'content': 0.0004058975900989026, 'timestamp': '2025-09-15 03:17:02.104219', 'step': 1090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:02.134852', 'step': 1090, 'epoch': 2} {'type': 'loss', 'content': 0.0005335028981789947, 'timestamp': '2025-09-15 03:17:02.137050', 'step': 1091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:02.168095', 'step': 1091, 'epoch': 2} {'type': 'loss', 'content': 7.210144394775853e-05, 'timestamp': '2025-09-15 03:17:02.191705', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:02.783176', 'step': 1092, 'epoch': 2} {'type': 'pplx', 'content': 125546625.83764112, 'timestamp': '2025-09-15 03:17:02.785083', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:02.814532', 'step': 1092, 'epoch': 2} {'type': 'loss', 'content': 0.00045872549526393414, 'timestamp': '2025-09-15 03:17:02.816946', 'step': 1093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:02.849473', 'step': 1093, 'epoch': 2} {'type': 'loss', 'content': 0.0002017955994233489, 'timestamp': '2025-09-15 03:17:02.851516', 'step': 1094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:02.882138', 'step': 1094, 'epoch': 2} {'type': 'loss', 'content': 0.014432324096560478, 'timestamp': '2025-09-15 03:17:02.886230', 'step': 1095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:02.917625', 'step': 1095, 'epoch': 2} {'type': 'loss', 'content': 0.00027981828316114843, 'timestamp': '2025-09-15 03:17:02.945326', 'step': 1096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:02.976326', 'step': 1096, 'epoch': 2} {'type': 'loss', 'content': 0.00010119278158526868, 'timestamp': '2025-09-15 03:17:02.978535', 'step': 1097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:03.016790', 'step': 1097, 'epoch': 2} {'type': 'loss', 'content': 0.0008557759574614465, 'timestamp': '2025-09-15 03:17:03.021161', 'step': 1098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:03.052725', 'step': 1098, 'epoch': 2} {'type': 'loss', 'content': 0.0012092305114492774, 'timestamp': '2025-09-15 03:17:03.059485', 'step': 1099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:03.089674', 'step': 1099, 'epoch': 2} {'type': 'loss', 'content': 0.02371017076075077, 'timestamp': '2025-09-15 03:17:03.113295', 'step': 1100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:03.146225', 'step': 1100, 'epoch': 2} {'type': 'loss', 'content': 0.0003077341243624687, 'timestamp': '2025-09-15 03:17:03.148479', 'step': 1101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:03.179937', 'step': 1101, 'epoch': 2} {'type': 'loss', 'content': 0.0024734383914619684, 'timestamp': '2025-09-15 03:17:03.183518', 'step': 1102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:03.214077', 'step': 1102, 'epoch': 2} {'type': 'loss', 'content': 0.0008149113273248076, 'timestamp': '2025-09-15 03:17:03.216186', 'step': 1103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:03.248301', 'step': 1103, 'epoch': 2} {'type': 'loss', 'content': 0.0012873790692538023, 'timestamp': '2025-09-15 03:17:03.272061', 'step': 1104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:03.302760', 'step': 1104, 'epoch': 2} {'type': 'loss', 'content': 0.00025210640160366893, 'timestamp': '2025-09-15 03:17:03.305310', 'step': 1105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:03.336477', 'step': 1105, 'epoch': 2} {'type': 'loss', 'content': 0.0013830851530656219, 'timestamp': '2025-09-15 03:17:03.340368', 'step': 1106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:03.371161', 'step': 1106, 'epoch': 2} {'type': 'loss', 'content': 0.0004376484430395067, 'timestamp': '2025-09-15 03:17:03.373884', 'step': 1107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:03.405828', 'step': 1107, 'epoch': 2} {'type': 'loss', 'content': 0.005435534752905369, 'timestamp': '2025-09-15 03:17:03.436659', 'step': 1108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:03.467201', 'step': 1108, 'epoch': 2} {'type': 'loss', 'content': 0.007525573950260878, 'timestamp': '2025-09-15 03:17:03.469369', 'step': 1109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:03.499274', 'step': 1109, 'epoch': 2} {'type': 'loss', 'content': 0.0014803019585087895, 'timestamp': '2025-09-15 03:17:03.501839', 'step': 1110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:03.532383', 'step': 1110, 'epoch': 2} {'type': 'loss', 'content': 0.0016765117179602385, 'timestamp': '2025-09-15 03:17:03.534631', 'step': 1111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:03.565067', 'step': 1111, 'epoch': 2} {'type': 'loss', 'content': 0.0002567689516581595, 'timestamp': '2025-09-15 03:17:03.588859', 'step': 1112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:03.620315', 'step': 1112, 'epoch': 2} {'type': 'loss', 'content': 0.00035767193185165524, 'timestamp': '2025-09-15 03:17:03.623949', 'step': 1113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:03.654710', 'step': 1113, 'epoch': 2} {'type': 'loss', 'content': 0.002797603141516447, 'timestamp': '2025-09-15 03:17:03.656797', 'step': 1114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:03.687752', 'step': 1114, 'epoch': 2} {'type': 'loss', 'content': 0.00018568325322121382, 'timestamp': '2025-09-15 03:17:03.689987', 'step': 1115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:03.722101', 'step': 1115, 'epoch': 2} {'type': 'loss', 'content': 0.00015704214456491172, 'timestamp': '2025-09-15 03:17:03.750370', 'step': 1116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:03.781058', 'step': 1116, 'epoch': 2} {'type': 'loss', 'content': 0.001136482460424304, 'timestamp': '2025-09-15 03:17:03.783206', 'step': 1117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:03.813877', 'step': 1117, 'epoch': 2} {'type': 'loss', 'content': 0.0017352696740999818, 'timestamp': '2025-09-15 03:17:03.816024', 'step': 1118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:03.847254', 'step': 1118, 'epoch': 2} {'type': 'loss', 'content': 0.00021176310838200152, 'timestamp': '2025-09-15 03:17:03.851378', 'step': 1119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:03.882018', 'step': 1119, 'epoch': 2} {'type': 'loss', 'content': 0.01274438202381134, 'timestamp': '2025-09-15 03:17:03.905831', 'step': 1120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:03.936972', 'step': 1120, 'epoch': 2} {'type': 'loss', 'content': 0.0004901143256574869, 'timestamp': '2025-09-15 03:17:03.942212', 'step': 1121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:03.972830', 'step': 1121, 'epoch': 2} {'type': 'loss', 'content': 0.0018064085161313415, 'timestamp': '2025-09-15 03:17:03.974979', 'step': 1122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:04.006128', 'step': 1122, 'epoch': 2} {'type': 'loss', 'content': 0.004721564706414938, 'timestamp': '2025-09-15 03:17:04.012697', 'step': 1123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:04.043430', 'step': 1123, 'epoch': 2} {'type': 'loss', 'content': 0.022109517827630043, 'timestamp': '2025-09-15 03:17:04.066826', 'step': 1124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:04.097736', 'step': 1124, 'epoch': 2} {'type': 'loss', 'content': 0.00010318388376617804, 'timestamp': '2025-09-15 03:17:04.099961', 'step': 1125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-15 03:17:04.130625', 'step': 1125, 'epoch': 2} {'type': 'loss', 'content': 0.0013412336120381951, 'timestamp': '2025-09-15 03:17:04.142684', 'step': 1126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:04.173163', 'step': 1126, 'epoch': 2} {'type': 'loss', 'content': 8.792152220848948e-05, 'timestamp': '2025-09-15 03:17:04.177172', 'step': 1127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:04.207499', 'step': 1127, 'epoch': 2} {'type': 'loss', 'content': 0.014445015229284763, 'timestamp': '2025-09-15 03:17:04.232346', 'step': 1128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:04.262924', 'step': 1128, 'epoch': 2} {'type': 'loss', 'content': 0.0215358417481184, 'timestamp': '2025-09-15 03:17:04.264843', 'step': 1129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:04.295527', 'step': 1129, 'epoch': 2} {'type': 'loss', 'content': 0.0004923886153846979, 'timestamp': '2025-09-15 03:17:04.302837', 'step': 1130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:04.333266', 'step': 1130, 'epoch': 2} {'type': 'loss', 'content': 0.00025934947188943624, 'timestamp': '2025-09-15 03:17:04.335529', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:04.942642', 'step': 1131, 'epoch': 2} {'type': 'pplx', 'content': 122099526.47280383, 'timestamp': '2025-09-15 03:17:04.944835', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:04.974377', 'step': 1131, 'epoch': 2} {'type': 'loss', 'content': 0.00045106312609277666, 'timestamp': '2025-09-15 03:17:04.998323', 'step': 1132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:05.029878', 'step': 1132, 'epoch': 2} {'type': 'loss', 'content': 0.00016361901361960918, 'timestamp': '2025-09-15 03:17:05.034062', 'step': 1133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:05.064589', 'step': 1133, 'epoch': 2} {'type': 'loss', 'content': 0.0002141120348824188, 'timestamp': '2025-09-15 03:17:05.066971', 'step': 1134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:05.097692', 'step': 1134, 'epoch': 2} {'type': 'loss', 'content': 0.0009333607158623636, 'timestamp': '2025-09-15 03:17:05.101781', 'step': 1135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:05.132724', 'step': 1135, 'epoch': 2} {'type': 'loss', 'content': 0.0034767123870551586, 'timestamp': '2025-09-15 03:17:05.161155', 'step': 1136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:05.192239', 'step': 1136, 'epoch': 2} {'type': 'loss', 'content': 0.0001437151659047231, 'timestamp': '2025-09-15 03:17:05.194436', 'step': 1137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:05.224976', 'step': 1137, 'epoch': 2} {'type': 'loss', 'content': 0.0009071500971913338, 'timestamp': '2025-09-15 03:17:05.231972', 'step': 1138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 2373281365952}, 'timestamp': '2025-09-15 03:17:05.262259', 'step': 1138, 'epoch': 2} {'type': 'loss', 'content': 0.0008537854300811887, 'timestamp': '2025-09-15 03:17:05.264356', 'step': 1139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:05.294797', 'step': 1139, 'epoch': 2} {'type': 'loss', 'content': 0.0015601081540808082, 'timestamp': '2025-09-15 03:17:05.318331', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:05.349035', 'step': 1140, 'epoch': 2} {'type': 'loss', 'content': 0.0004569217562675476, 'timestamp': '2025-09-15 03:17:05.353546', 'step': 1141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:05.385607', 'step': 1141, 'epoch': 2} {'type': 'loss', 'content': 0.009549853391945362, 'timestamp': '2025-09-15 03:17:05.389555', 'step': 1142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:05.420992', 'step': 1142, 'epoch': 2} {'type': 'loss', 'content': 0.00030883742147125304, 'timestamp': '2025-09-15 03:17:05.423042', 'step': 1143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:05.453101', 'step': 1143, 'epoch': 2} {'type': 'loss', 'content': 0.005748115945607424, 'timestamp': '2025-09-15 03:17:05.476823', 'step': 1144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:05.507371', 'step': 1144, 'epoch': 2} {'type': 'loss', 'content': 0.0008632918470539153, 'timestamp': '2025-09-15 03:17:05.509481', 'step': 1145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:05.539429', 'step': 1145, 'epoch': 2} {'type': 'loss', 'content': 0.003175254911184311, 'timestamp': '2025-09-15 03:17:05.543879', 'step': 1146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:05.575058', 'step': 1146, 'epoch': 2} {'type': 'loss', 'content': 0.00045585259795188904, 'timestamp': '2025-09-15 03:17:05.581557', 'step': 1147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:05.612199', 'step': 1147, 'epoch': 2} {'type': 'loss', 'content': 0.005220616701990366, 'timestamp': '2025-09-15 03:17:05.635904', 'step': 1148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:05.667118', 'step': 1148, 'epoch': 2} {'type': 'loss', 'content': 0.005695756059139967, 'timestamp': '2025-09-15 03:17:05.669117', 'step': 1149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:05.699733', 'step': 1149, 'epoch': 2} {'type': 'loss', 'content': 0.005306906998157501, 'timestamp': '2025-09-15 03:17:05.706426', 'step': 1150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:05.737014', 'step': 1150, 'epoch': 2} {'type': 'loss', 'content': 0.00037352906656451523, 'timestamp': '2025-09-15 03:17:05.743739', 'step': 1151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:05.774727', 'step': 1151, 'epoch': 2} {'type': 'loss', 'content': 0.030560487881302834, 'timestamp': '2025-09-15 03:17:05.798353', 'step': 1152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:05.828614', 'step': 1152, 'epoch': 2} {'type': 'loss', 'content': 0.002387533662840724, 'timestamp': '2025-09-15 03:17:05.830772', 'step': 1153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:05.861247', 'step': 1153, 'epoch': 2} {'type': 'loss', 'content': 0.0002033365744864568, 'timestamp': '2025-09-15 03:17:05.863116', 'step': 1154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:05.893569', 'step': 1154, 'epoch': 2} {'type': 'loss', 'content': 6.287390715442598e-05, 'timestamp': '2025-09-15 03:17:05.895757', 'step': 1155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:05.926717', 'step': 1155, 'epoch': 2} {'type': 'loss', 'content': 0.0009798947721719742, 'timestamp': '2025-09-15 03:17:05.950287', 'step': 1156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:05.982146', 'step': 1156, 'epoch': 2} {'type': 'loss', 'content': 7.104845280991867e-05, 'timestamp': '2025-09-15 03:17:05.984187', 'step': 1157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:06.014629', 'step': 1157, 'epoch': 2} {'type': 'loss', 'content': 8.981012069853023e-05, 'timestamp': '2025-09-15 03:17:06.016817', 'step': 1158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:06.048115', 'step': 1158, 'epoch': 2} {'type': 'loss', 'content': 0.0008855522028170526, 'timestamp': '2025-09-15 03:17:06.051843', 'step': 1159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:06.083815', 'step': 1159, 'epoch': 2} {'type': 'loss', 'content': 0.005247277207672596, 'timestamp': '2025-09-15 03:17:06.107239', 'step': 1160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:06.138635', 'step': 1160, 'epoch': 2} {'type': 'loss', 'content': 0.003661633934825659, 'timestamp': '2025-09-15 03:17:06.140772', 'step': 1161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:06.171486', 'step': 1161, 'epoch': 2} {'type': 'loss', 'content': 0.004740457516163588, 'timestamp': '2025-09-15 03:17:06.178110', 'step': 1162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:06.209012', 'step': 1162, 'epoch': 2} {'type': 'loss', 'content': 0.0006330661126412451, 'timestamp': '2025-09-15 03:17:06.212899', 'step': 1163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:06.243996', 'step': 1163, 'epoch': 2} {'type': 'loss', 'content': 0.029458897188305855, 'timestamp': '2025-09-15 03:17:06.271491', 'step': 1164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:06.301940', 'step': 1164, 'epoch': 2} {'type': 'loss', 'content': 0.0018987554358318448, 'timestamp': '2025-09-15 03:17:06.304019', 'step': 1165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:06.334234', 'step': 1165, 'epoch': 2} {'type': 'loss', 'content': 0.02847990207374096, 'timestamp': '2025-09-15 03:17:06.338249', 'step': 1166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:06.369426', 'step': 1166, 'epoch': 2} {'type': 'loss', 'content': 0.003378561232239008, 'timestamp': '2025-09-15 03:17:06.371776', 'step': 1167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:06.402487', 'step': 1167, 'epoch': 2} {'type': 'loss', 'content': 0.00012590606638696045, 'timestamp': '2025-09-15 03:17:06.426082', 'step': 1168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:06.456755', 'step': 1168, 'epoch': 2} {'type': 'loss', 'content': 0.008719941601157188, 'timestamp': '2025-09-15 03:17:06.459685', 'step': 1169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:06.491056', 'step': 1169, 'epoch': 2} {'type': 'loss', 'content': 0.0001788309746189043, 'timestamp': '2025-09-15 03:17:06.497885', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:07.090935', 'step': 1170, 'epoch': 2} {'type': 'pplx', 'content': 120217233.33391637, 'timestamp': '2025-09-15 03:17:07.092896', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:07.123131', 'step': 1170, 'epoch': 2} {'type': 'loss', 'content': 0.0017449696315452456, 'timestamp': '2025-09-15 03:17:07.125347', 'step': 1171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:07.156096', 'step': 1171, 'epoch': 2} {'type': 'loss', 'content': 5.670529208146036e-05, 'timestamp': '2025-09-15 03:17:07.179700', 'step': 1172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:07.210374', 'step': 1172, 'epoch': 2} {'type': 'loss', 'content': 0.002883358160033822, 'timestamp': '2025-09-15 03:17:07.212866', 'step': 1173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:07.243655', 'step': 1173, 'epoch': 2} {'type': 'loss', 'content': 0.00012270697334315628, 'timestamp': '2025-09-15 03:17:07.245616', 'step': 1174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:07.276156', 'step': 1174, 'epoch': 2} {'type': 'loss', 'content': 0.00018073969113174826, 'timestamp': '2025-09-15 03:17:07.282715', 'step': 1175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:07.313486', 'step': 1175, 'epoch': 2} {'type': 'loss', 'content': 0.00023215598776005208, 'timestamp': '2025-09-15 03:17:07.337087', 'step': 1176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:07.367681', 'step': 1176, 'epoch': 2} {'type': 'loss', 'content': 0.0001968259020941332, 'timestamp': '2025-09-15 03:17:07.370962', 'step': 1177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:07.401091', 'step': 1177, 'epoch': 2} {'type': 'loss', 'content': 0.00012490291555877775, 'timestamp': '2025-09-15 03:17:07.403092', 'step': 1178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:07.433654', 'step': 1178, 'epoch': 2} {'type': 'loss', 'content': 0.0007915376918390393, 'timestamp': '2025-09-15 03:17:07.437907', 'step': 1179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:07.468907', 'step': 1179, 'epoch': 2} {'type': 'loss', 'content': 0.0008563162409700453, 'timestamp': '2025-09-15 03:17:07.494042', 'step': 1180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:07.524473', 'step': 1180, 'epoch': 2} {'type': 'loss', 'content': 0.000734034925699234, 'timestamp': '2025-09-15 03:17:07.526753', 'step': 1181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:07.557402', 'step': 1181, 'epoch': 2} {'type': 'loss', 'content': 0.0029953443445265293, 'timestamp': '2025-09-15 03:17:07.559647', 'step': 1182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:07.590136', 'step': 1182, 'epoch': 2} {'type': 'loss', 'content': 0.00032776501029729843, 'timestamp': '2025-09-15 03:17:07.593866', 'step': 1183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:07.625250', 'step': 1183, 'epoch': 2} {'type': 'loss', 'content': 0.003944663796573877, 'timestamp': '2025-09-15 03:17:07.652739', 'step': 1184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:07.685969', 'step': 1184, 'epoch': 2} {'type': 'loss', 'content': 0.0007250604103319347, 'timestamp': '2025-09-15 03:17:07.690343', 'step': 1185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:07.720759', 'step': 1185, 'epoch': 2} {'type': 'loss', 'content': 0.00011083681602030993, 'timestamp': '2025-09-15 03:17:07.722904', 'step': 1186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:07.753963', 'step': 1186, 'epoch': 2} {'type': 'loss', 'content': 0.0006312718614935875, 'timestamp': '2025-09-15 03:17:07.758226', 'step': 1187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:07.789002', 'step': 1187, 'epoch': 2} {'type': 'loss', 'content': 0.0006342697306536138, 'timestamp': '2025-09-15 03:17:07.816687', 'step': 1188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:07.847386', 'step': 1188, 'epoch': 2} {'type': 'loss', 'content': 0.006963182706385851, 'timestamp': '2025-09-15 03:17:07.849520', 'step': 1189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:07.879628', 'step': 1189, 'epoch': 2} {'type': 'loss', 'content': 0.00021931444643996656, 'timestamp': '2025-09-15 03:17:07.881968', 'step': 1190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:07.913665', 'step': 1190, 'epoch': 2} {'type': 'loss', 'content': 0.00013834961282555014, 'timestamp': '2025-09-15 03:17:07.920242', 'step': 1191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:07.951175', 'step': 1191, 'epoch': 2} {'type': 'loss', 'content': 7.177861698437482e-05, 'timestamp': '2025-09-15 03:17:07.974898', 'step': 1192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:08.006731', 'step': 1192, 'epoch': 2} {'type': 'loss', 'content': 0.0039025992155075073, 'timestamp': '2025-09-15 03:17:08.008953', 'step': 1193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:08.039818', 'step': 1193, 'epoch': 2} {'type': 'loss', 'content': 8.208167128032073e-05, 'timestamp': '2025-09-15 03:17:08.046515', 'step': 1194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:08.077571', 'step': 1194, 'epoch': 2} {'type': 'loss', 'content': 0.00020119234977755696, 'timestamp': '2025-09-15 03:17:08.081437', 'step': 1195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:08.113336', 'step': 1195, 'epoch': 2} {'type': 'loss', 'content': 0.002922703977674246, 'timestamp': '2025-09-15 03:17:08.137106', 'step': 1196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:08.167695', 'step': 1196, 'epoch': 2} {'type': 'loss', 'content': 5.8547982916934416e-05, 'timestamp': '2025-09-15 03:17:08.171827', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:08.203339', 'step': 1197, 'epoch': 2} {'type': 'loss', 'content': 0.005056845489889383, 'timestamp': '2025-09-15 03:17:08.210471', 'step': 1198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:08.241147', 'step': 1198, 'epoch': 2} {'type': 'loss', 'content': 5.344694000086747e-05, 'timestamp': '2025-09-15 03:17:08.243467', 'step': 1199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:08.274387', 'step': 1199, 'epoch': 2} {'type': 'loss', 'content': 0.0004614375939127058, 'timestamp': '2025-09-15 03:17:08.297865', 'step': 1200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:08.328913', 'step': 1200, 'epoch': 2} {'type': 'loss', 'content': 0.0005172430537641048, 'timestamp': '2025-09-15 03:17:08.331081', 'step': 1201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:08.361990', 'step': 1201, 'epoch': 2} {'type': 'loss', 'content': 0.0038748490624129772, 'timestamp': '2025-09-15 03:17:08.368640', 'step': 1202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:08.400412', 'step': 1202, 'epoch': 2} {'type': 'loss', 'content': 0.00013763025344815105, 'timestamp': '2025-09-15 03:17:08.404251', 'step': 1203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:08.434970', 'step': 1203, 'epoch': 2} {'type': 'loss', 'content': 0.00014176352124195546, 'timestamp': '2025-09-15 03:17:08.460040', 'step': 1204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:08.490957', 'step': 1204, 'epoch': 2} {'type': 'loss', 'content': 6.545721407746896e-05, 'timestamp': '2025-09-15 03:17:08.496008', 'step': 1205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:08.526868', 'step': 1205, 'epoch': 2} {'type': 'loss', 'content': 0.00043702914263121784, 'timestamp': '2025-09-15 03:17:08.530901', 'step': 1206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:08.561892', 'step': 1206, 'epoch': 2} {'type': 'loss', 'content': 0.00014172600640449673, 'timestamp': '2025-09-15 03:17:08.565895', 'step': 1207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:08.596726', 'step': 1207, 'epoch': 2} {'type': 'loss', 'content': 0.010499807074666023, 'timestamp': '2025-09-15 03:17:08.620459', 'step': 1208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:08.651516', 'step': 1208, 'epoch': 2} {'type': 'loss', 'content': 0.00040008421638049185, 'timestamp': '2025-09-15 03:17:08.653846', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:09.242304', 'step': 1209, 'epoch': 2} {'type': 'pplx', 'content': 126998915.8734457, 'timestamp': '2025-09-15 03:17:09.244291', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:09.272913', 'step': 1209, 'epoch': 2} {'type': 'loss', 'content': 0.0004073081654496491, 'timestamp': '2025-09-15 03:17:09.277152', 'step': 1210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:09.307353', 'step': 1210, 'epoch': 2} {'type': 'loss', 'content': 0.01717802695930004, 'timestamp': '2025-09-15 03:17:09.309297', 'step': 1211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:09.339861', 'step': 1211, 'epoch': 2} {'type': 'loss', 'content': 0.0004054214514326304, 'timestamp': '2025-09-15 03:17:09.364516', 'step': 1212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:09.394786', 'step': 1212, 'epoch': 2} {'type': 'loss', 'content': 0.00017672948888503015, 'timestamp': '2025-09-15 03:17:09.397424', 'step': 1213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:09.429154', 'step': 1213, 'epoch': 2} {'type': 'loss', 'content': 0.00019165755657013506, 'timestamp': '2025-09-15 03:17:09.433302', 'step': 1214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:09.464113', 'step': 1214, 'epoch': 2} {'type': 'loss', 'content': 3.5844284866470844e-05, 'timestamp': '2025-09-15 03:17:09.466394', 'step': 1215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:09.497093', 'step': 1215, 'epoch': 2} {'type': 'loss', 'content': 0.00017610486247576773, 'timestamp': '2025-09-15 03:17:09.520789', 'step': 1216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:09.551283', 'step': 1216, 'epoch': 2} {'type': 'loss', 'content': 8.69460855028592e-05, 'timestamp': '2025-09-15 03:17:09.553499', 'step': 1217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:09.584416', 'step': 1217, 'epoch': 2} {'type': 'loss', 'content': 0.0020858903881162405, 'timestamp': '2025-09-15 03:17:09.586838', 'step': 1218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:09.617183', 'step': 1218, 'epoch': 2} {'type': 'loss', 'content': 0.00037857977440580726, 'timestamp': '2025-09-15 03:17:09.619561', 'step': 1219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:09.650274', 'step': 1219, 'epoch': 2} {'type': 'loss', 'content': 0.00012488874199334532, 'timestamp': '2025-09-15 03:17:09.673861', 'step': 1220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:09.704428', 'step': 1220, 'epoch': 2} {'type': 'loss', 'content': 5.675610373145901e-05, 'timestamp': '2025-09-15 03:17:09.706697', 'step': 1221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:09.737747', 'step': 1221, 'epoch': 2} {'type': 'loss', 'content': 0.00016086138202808797, 'timestamp': '2025-09-15 03:17:09.739851', 'step': 1222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:09.770677', 'step': 1222, 'epoch': 2} {'type': 'loss', 'content': 0.013607896864414215, 'timestamp': '2025-09-15 03:17:09.778253', 'step': 1223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:09.809227', 'step': 1223, 'epoch': 2} {'type': 'loss', 'content': 0.03053331933915615, 'timestamp': '2025-09-15 03:17:09.832750', 'step': 1224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:09.864133', 'step': 1224, 'epoch': 2} {'type': 'loss', 'content': 0.00015589622489642352, 'timestamp': '2025-09-15 03:17:09.868266', 'step': 1225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:09.899025', 'step': 1225, 'epoch': 2} {'type': 'loss', 'content': 0.0008411963353864849, 'timestamp': '2025-09-15 03:17:09.902154', 'step': 1226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:09.933017', 'step': 1226, 'epoch': 2} {'type': 'loss', 'content': 0.00041311918175779283, 'timestamp': '2025-09-15 03:17:09.937236', 'step': 1227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:09.967738', 'step': 1227, 'epoch': 2} {'type': 'loss', 'content': 0.0003874626418109983, 'timestamp': '2025-09-15 03:17:09.991429', 'step': 1228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:10.022233', 'step': 1228, 'epoch': 2} {'type': 'loss', 'content': 0.0001830274413805455, 'timestamp': '2025-09-15 03:17:10.024501', 'step': 1229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:10.055105', 'step': 1229, 'epoch': 2} {'type': 'loss', 'content': 0.05192428082227707, 'timestamp': '2025-09-15 03:17:10.057900', 'step': 1230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:10.088440', 'step': 1230, 'epoch': 2} {'type': 'loss', 'content': 8.481832628604025e-05, 'timestamp': '2025-09-15 03:17:10.090863', 'step': 1231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:10.121758', 'step': 1231, 'epoch': 2} {'type': 'loss', 'content': 0.0008766755345277488, 'timestamp': '2025-09-15 03:17:10.146894', 'step': 1232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:10.177054', 'step': 1232, 'epoch': 2} {'type': 'loss', 'content': 0.00022905074001755565, 'timestamp': '2025-09-15 03:17:10.179338', 'step': 1233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:10.209814', 'step': 1233, 'epoch': 2} {'type': 'loss', 'content': 0.007411510683596134, 'timestamp': '2025-09-15 03:17:10.213938', 'step': 1234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:10.247195', 'step': 1234, 'epoch': 2} {'type': 'loss', 'content': 0.0008861946989782155, 'timestamp': '2025-09-15 03:17:10.253766', 'step': 1235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:10.285336', 'step': 1235, 'epoch': 2} {'type': 'loss', 'content': 4.034994708490558e-05, 'timestamp': '2025-09-15 03:17:10.312785', 'step': 1236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:10.345919', 'step': 1236, 'epoch': 2} {'type': 'loss', 'content': 0.0019980978686362505, 'timestamp': '2025-09-15 03:17:10.347998', 'step': 1237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:10.378577', 'step': 1237, 'epoch': 2} {'type': 'loss', 'content': 0.001465094625018537, 'timestamp': '2025-09-15 03:17:10.385492', 'step': 1238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:10.415967', 'step': 1238, 'epoch': 2} {'type': 'loss', 'content': 0.017953436821699142, 'timestamp': '2025-09-15 03:17:10.418226', 'step': 1239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:10.450453', 'step': 1239, 'epoch': 2} {'type': 'loss', 'content': 0.0008248678059317172, 'timestamp': '2025-09-15 03:17:10.478465', 'step': 1240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:10.509760', 'step': 1240, 'epoch': 2} {'type': 'loss', 'content': 0.012165131978690624, 'timestamp': '2025-09-15 03:17:10.511731', 'step': 1241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:10.542917', 'step': 1241, 'epoch': 2} {'type': 'loss', 'content': 0.0002835580671671778, 'timestamp': '2025-09-15 03:17:10.546925', 'step': 1242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:10.577593', 'step': 1242, 'epoch': 2} {'type': 'loss', 'content': 0.00029444595566019416, 'timestamp': '2025-09-15 03:17:10.581373', 'step': 1243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:10.612272', 'step': 1243, 'epoch': 2} {'type': 'loss', 'content': 0.0013534992467612028, 'timestamp': '2025-09-15 03:17:10.636768', 'step': 1244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:10.668331', 'step': 1244, 'epoch': 2} {'type': 'loss', 'content': 0.00040784329758025706, 'timestamp': '2025-09-15 03:17:10.672322', 'step': 1245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [2, 192], 'flops': 2847885110400}, 'timestamp': '2025-09-15 03:17:10.703260', 'step': 1245, 'epoch': 2} {'type': 'loss', 'content': 0.00012379752297420055, 'timestamp': '2025-09-15 03:17:10.705681', 'step': 1246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:10.755385', 'step': 1246, 'epoch': 3} {'type': 'loss', 'content': 0.004293285310268402, 'timestamp': '2025-09-15 03:17:10.758474', 'step': 1247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:10.789678', 'step': 1247, 'epoch': 3} {'type': 'loss', 'content': 0.002140170196071267, 'timestamp': '2025-09-15 03:17:10.813333', 'step': 1248, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:11.402373', 'step': 1248, 'epoch': 3} {'type': 'pplx', 'content': 125033048.54934657, 'timestamp': '2025-09-15 03:17:11.404551', 'step': 1248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:11.432890', 'step': 1248, 'epoch': 3} {'type': 'loss', 'content': 0.00016366194176953286, 'timestamp': '2025-09-15 03:17:11.435041', 'step': 1249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:11.465767', 'step': 1249, 'epoch': 3} {'type': 'loss', 'content': 0.0023328284732997417, 'timestamp': '2025-09-15 03:17:11.469528', 'step': 1250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:11.500230', 'step': 1250, 'epoch': 3} {'type': 'loss', 'content': 0.0001446189417038113, 'timestamp': '2025-09-15 03:17:11.507410', 'step': 1251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:11.537923', 'step': 1251, 'epoch': 3} {'type': 'loss', 'content': 0.0007846828666515648, 'timestamp': '2025-09-15 03:17:11.563174', 'step': 1252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:11.593624', 'step': 1252, 'epoch': 3} {'type': 'loss', 'content': 0.00610629515722394, 'timestamp': '2025-09-15 03:17:11.595673', 'step': 1253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:11.625907', 'step': 1253, 'epoch': 3} {'type': 'loss', 'content': 0.03473528474569321, 'timestamp': '2025-09-15 03:17:11.630120', 'step': 1254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:11.661343', 'step': 1254, 'epoch': 3} {'type': 'loss', 'content': 0.0007956092595122755, 'timestamp': '2025-09-15 03:17:11.663421', 'step': 1255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:11.693649', 'step': 1255, 'epoch': 3} {'type': 'loss', 'content': 0.0005844756960868835, 'timestamp': '2025-09-15 03:17:11.717118', 'step': 1256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:11.748174', 'step': 1256, 'epoch': 3} {'type': 'loss', 'content': 0.005617126356810331, 'timestamp': '2025-09-15 03:17:11.752873', 'step': 1257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:11.784850', 'step': 1257, 'epoch': 3} {'type': 'loss', 'content': 0.00012601922207977623, 'timestamp': '2025-09-15 03:17:11.788471', 'step': 1258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:11.819867', 'step': 1258, 'epoch': 3} {'type': 'loss', 'content': 7.824383646948263e-05, 'timestamp': '2025-09-15 03:17:11.822295', 'step': 1259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:11.853192', 'step': 1259, 'epoch': 3} {'type': 'loss', 'content': 0.004577727522701025, 'timestamp': '2025-09-15 03:17:11.876651', 'step': 1260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:11.907607', 'step': 1260, 'epoch': 3} {'type': 'loss', 'content': 0.03704509511590004, 'timestamp': '2025-09-15 03:17:11.909993', 'step': 1261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:11.941140', 'step': 1261, 'epoch': 3} {'type': 'loss', 'content': 0.002187976147979498, 'timestamp': '2025-09-15 03:17:11.943525', 'step': 1262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:11.974060', 'step': 1262, 'epoch': 3} {'type': 'loss', 'content': 0.0003167348331771791, 'timestamp': '2025-09-15 03:17:11.976351', 'step': 1263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.006870', 'step': 1263, 'epoch': 3} {'type': 'loss', 'content': 0.00025892173289321363, 'timestamp': '2025-09-15 03:17:12.034443', 'step': 1264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:12.066523', 'step': 1264, 'epoch': 3} {'type': 'loss', 'content': 0.0006322207627817988, 'timestamp': '2025-09-15 03:17:12.068514', 'step': 1265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.099093', 'step': 1265, 'epoch': 3} {'type': 'loss', 'content': 0.0007802381296642125, 'timestamp': '2025-09-15 03:17:12.106323', 'step': 1266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.138278', 'step': 1266, 'epoch': 3} {'type': 'loss', 'content': 0.0011284631909802556, 'timestamp': '2025-09-15 03:17:12.144637', 'step': 1267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:12.184579', 'step': 1267, 'epoch': 3} {'type': 'loss', 'content': 0.004695535637438297, 'timestamp': '2025-09-15 03:17:12.208286', 'step': 1268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:12.239092', 'step': 1268, 'epoch': 3} {'type': 'loss', 'content': 0.02026580460369587, 'timestamp': '2025-09-15 03:17:12.241220', 'step': 1269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:12.273008', 'step': 1269, 'epoch': 3} {'type': 'loss', 'content': 0.02153332531452179, 'timestamp': '2025-09-15 03:17:12.277407', 'step': 1270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.308163', 'step': 1270, 'epoch': 3} {'type': 'loss', 'content': 0.00013611149915959686, 'timestamp': '2025-09-15 03:17:12.314720', 'step': 1271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:12.349512', 'step': 1271, 'epoch': 3} {'type': 'loss', 'content': 0.0013938635820522904, 'timestamp': '2025-09-15 03:17:12.377986', 'step': 1272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:12.411824', 'step': 1272, 'epoch': 3} {'type': 'loss', 'content': 0.021907132118940353, 'timestamp': '2025-09-15 03:17:12.414248', 'step': 1273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:12.445187', 'step': 1273, 'epoch': 3} {'type': 'loss', 'content': 0.00254437536932528, 'timestamp': '2025-09-15 03:17:12.447166', 'step': 1274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:12.486922', 'step': 1274, 'epoch': 3} {'type': 'loss', 'content': 0.0045538186095654964, 'timestamp': '2025-09-15 03:17:12.493651', 'step': 1275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:12.525206', 'step': 1275, 'epoch': 3} {'type': 'loss', 'content': 0.00021196853776928037, 'timestamp': '2025-09-15 03:17:12.549122', 'step': 1276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:12.581726', 'step': 1276, 'epoch': 3} {'type': 'loss', 'content': 0.00013388384832069278, 'timestamp': '2025-09-15 03:17:12.583993', 'step': 1277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:12.615311', 'step': 1277, 'epoch': 3} {'type': 'loss', 'content': 0.0018024734454229474, 'timestamp': '2025-09-15 03:17:12.622011', 'step': 1278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.659800', 'step': 1278, 'epoch': 3} {'type': 'loss', 'content': 0.008215748704969883, 'timestamp': '2025-09-15 03:17:12.665937', 'step': 1279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.697138', 'step': 1279, 'epoch': 3} {'type': 'loss', 'content': 0.0039006711449474096, 'timestamp': '2025-09-15 03:17:12.724650', 'step': 1280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:12.757677', 'step': 1280, 'epoch': 3} {'type': 'loss', 'content': 0.000163158867508173, 'timestamp': '2025-09-15 03:17:12.761202', 'step': 1281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:12.792137', 'step': 1281, 'epoch': 3} {'type': 'loss', 'content': 0.005478131119161844, 'timestamp': '2025-09-15 03:17:12.794229', 'step': 1282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:12.824751', 'step': 1282, 'epoch': 3} {'type': 'loss', 'content': 0.00642383610829711, 'timestamp': '2025-09-15 03:17:12.830408', 'step': 1283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:12.865609', 'step': 1283, 'epoch': 3} {'type': 'loss', 'content': 0.0033451267518103123, 'timestamp': '2025-09-15 03:17:12.889224', 'step': 1284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:12.919833', 'step': 1284, 'epoch': 3} {'type': 'loss', 'content': 0.015903616324067116, 'timestamp': '2025-09-15 03:17:12.921990', 'step': 1285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:12.960698', 'step': 1285, 'epoch': 3} {'type': 'loss', 'content': 0.0002753768640104681, 'timestamp': '2025-09-15 03:17:12.964625', 'step': 1286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:12.995067', 'step': 1286, 'epoch': 3} {'type': 'loss', 'content': 0.0006740019307471812, 'timestamp': '2025-09-15 03:17:13.001635', 'step': 1287, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:13.588533', 'step': 1287, 'epoch': 3} {'type': 'pplx', 'content': 117707103.54334375, 'timestamp': '2025-09-15 03:17:13.590451', 'step': 1287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:13.620322', 'step': 1287, 'epoch': 3} {'type': 'loss', 'content': 0.017900949344038963, 'timestamp': '2025-09-15 03:17:13.644774', 'step': 1288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:13.676991', 'step': 1288, 'epoch': 3} {'type': 'loss', 'content': 0.00039724045200273395, 'timestamp': '2025-09-15 03:17:13.679062', 'step': 1289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:13.710333', 'step': 1289, 'epoch': 3} {'type': 'loss', 'content': 0.01001682598143816, 'timestamp': '2025-09-15 03:17:13.716860', 'step': 1290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:13.748463', 'step': 1290, 'epoch': 3} {'type': 'loss', 'content': 0.0025902953930199146, 'timestamp': '2025-09-15 03:17:13.751489', 'step': 1291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:13.782639', 'step': 1291, 'epoch': 3} {'type': 'loss', 'content': 0.00179091386962682, 'timestamp': '2025-09-15 03:17:13.807955', 'step': 1292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:13.838599', 'step': 1292, 'epoch': 3} {'type': 'loss', 'content': 0.0013810384552925825, 'timestamp': '2025-09-15 03:17:13.840951', 'step': 1293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:13.871492', 'step': 1293, 'epoch': 3} {'type': 'loss', 'content': 0.0022027629893273115, 'timestamp': '2025-09-15 03:17:13.873514', 'step': 1294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:13.904060', 'step': 1294, 'epoch': 3} {'type': 'loss', 'content': 0.0019874197896569967, 'timestamp': '2025-09-15 03:17:13.906219', 'step': 1295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:13.936900', 'step': 1295, 'epoch': 3} {'type': 'loss', 'content': 0.0016624435083940625, 'timestamp': '2025-09-15 03:17:13.960697', 'step': 1296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:13.990556', 'step': 1296, 'epoch': 3} {'type': 'loss', 'content': 0.00470365583896637, 'timestamp': '2025-09-15 03:17:13.992458', 'step': 1297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:14.023250', 'step': 1297, 'epoch': 3} {'type': 'loss', 'content': 0.00042901348206214607, 'timestamp': '2025-09-15 03:17:14.025272', 'step': 1298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:14.055612', 'step': 1298, 'epoch': 3} {'type': 'loss', 'content': 0.0024480712600052357, 'timestamp': '2025-09-15 03:17:14.058083', 'step': 1299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:14.088874', 'step': 1299, 'epoch': 3} {'type': 'loss', 'content': 0.0006906711496412754, 'timestamp': '2025-09-15 03:17:14.112659', 'step': 1300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:14.143299', 'step': 1300, 'epoch': 3} {'type': 'loss', 'content': 0.009458900429308414, 'timestamp': '2025-09-15 03:17:14.145420', 'step': 1301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:14.175812', 'step': 1301, 'epoch': 3} {'type': 'loss', 'content': 0.003652880433946848, 'timestamp': '2025-09-15 03:17:14.177795', 'step': 1302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:14.208538', 'step': 1302, 'epoch': 3} {'type': 'loss', 'content': 0.0004979693330824375, 'timestamp': '2025-09-15 03:17:14.210726', 'step': 1303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:14.241569', 'step': 1303, 'epoch': 3} {'type': 'loss', 'content': 0.007943389937281609, 'timestamp': '2025-09-15 03:17:14.264988', 'step': 1304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:14.296595', 'step': 1304, 'epoch': 3} {'type': 'loss', 'content': 0.008471814915537834, 'timestamp': '2025-09-15 03:17:14.298487', 'step': 1305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:14.328621', 'step': 1305, 'epoch': 3} {'type': 'loss', 'content': 0.03487579524517059, 'timestamp': '2025-09-15 03:17:14.330760', 'step': 1306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:14.361653', 'step': 1306, 'epoch': 3} {'type': 'loss', 'content': 0.002479907125234604, 'timestamp': '2025-09-15 03:17:14.363872', 'step': 1307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:14.396651', 'step': 1307, 'epoch': 3} {'type': 'loss', 'content': 0.00047891700523905456, 'timestamp': '2025-09-15 03:17:14.421457', 'step': 1308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:14.452004', 'step': 1308, 'epoch': 3} {'type': 'loss', 'content': 0.0009700975497253239, 'timestamp': '2025-09-15 03:17:14.454102', 'step': 1309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:14.484472', 'step': 1309, 'epoch': 3} {'type': 'loss', 'content': 0.0008935186197049916, 'timestamp': '2025-09-15 03:17:14.488647', 'step': 1310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:14.519384', 'step': 1310, 'epoch': 3} {'type': 'loss', 'content': 0.008795429021120071, 'timestamp': '2025-09-15 03:17:14.522991', 'step': 1311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:14.554193', 'step': 1311, 'epoch': 3} {'type': 'loss', 'content': 0.006600773893296719, 'timestamp': '2025-09-15 03:17:14.577811', 'step': 1312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:14.608655', 'step': 1312, 'epoch': 3} {'type': 'loss', 'content': 0.018867747858166695, 'timestamp': '2025-09-15 03:17:14.611002', 'step': 1313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:14.641810', 'step': 1313, 'epoch': 3} {'type': 'loss', 'content': 0.0008665714995004237, 'timestamp': '2025-09-15 03:17:14.644014', 'step': 1314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:14.674439', 'step': 1314, 'epoch': 3} {'type': 'loss', 'content': 0.0014345311792567372, 'timestamp': '2025-09-15 03:17:14.678247', 'step': 1315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:14.710011', 'step': 1315, 'epoch': 3} {'type': 'loss', 'content': 0.0004817911540158093, 'timestamp': '2025-09-15 03:17:14.733548', 'step': 1316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:14.764297', 'step': 1316, 'epoch': 3} {'type': 'loss', 'content': 0.0016825651982799172, 'timestamp': '2025-09-15 03:17:14.767897', 'step': 1317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:14.797840', 'step': 1317, 'epoch': 3} {'type': 'loss', 'content': 0.0014796556206420064, 'timestamp': '2025-09-15 03:17:14.799997', 'step': 1318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:14.830624', 'step': 1318, 'epoch': 3} {'type': 'loss', 'content': 0.00032859708881005645, 'timestamp': '2025-09-15 03:17:14.832744', 'step': 1319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:14.863545', 'step': 1319, 'epoch': 3} {'type': 'loss', 'content': 0.0011759724002331495, 'timestamp': '2025-09-15 03:17:14.887673', 'step': 1320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:14.918357', 'step': 1320, 'epoch': 3} {'type': 'loss', 'content': 7.884834485594183e-05, 'timestamp': '2025-09-15 03:17:14.920322', 'step': 1321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:14.951652', 'step': 1321, 'epoch': 3} {'type': 'loss', 'content': 0.00024336694332305342, 'timestamp': '2025-09-15 03:17:14.955758', 'step': 1322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:14.986712', 'step': 1322, 'epoch': 3} {'type': 'loss', 'content': 0.0012510953238233924, 'timestamp': '2025-09-15 03:17:14.989087', 'step': 1323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:15.019814', 'step': 1323, 'epoch': 3} {'type': 'loss', 'content': 0.012558196671307087, 'timestamp': '2025-09-15 03:17:15.047203', 'step': 1324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:15.077861', 'step': 1324, 'epoch': 3} {'type': 'loss', 'content': 0.03249092772603035, 'timestamp': '2025-09-15 03:17:15.082532', 'step': 1325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:15.113161', 'step': 1325, 'epoch': 3} {'type': 'loss', 'content': 0.0008610652876086533, 'timestamp': '2025-09-15 03:17:15.117034', 'step': 1326, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:15.703944', 'step': 1326, 'epoch': 3} {'type': 'pplx', 'content': 114827208.4781151, 'timestamp': '2025-09-15 03:17:15.705866', 'step': 1326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:15.735595', 'step': 1326, 'epoch': 3} {'type': 'loss', 'content': 0.0011096058879047632, 'timestamp': '2025-09-15 03:17:15.741929', 'step': 1327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:15.772332', 'step': 1327, 'epoch': 3} {'type': 'loss', 'content': 0.0001657214161241427, 'timestamp': '2025-09-15 03:17:15.797420', 'step': 1328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:15.828295', 'step': 1328, 'epoch': 3} {'type': 'loss', 'content': 0.00018686882685869932, 'timestamp': '2025-09-15 03:17:15.833139', 'step': 1329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:15.864085', 'step': 1329, 'epoch': 3} {'type': 'loss', 'content': 0.0023887602146714926, 'timestamp': '2025-09-15 03:17:15.867848', 'step': 1330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:15.899069', 'step': 1330, 'epoch': 3} {'type': 'loss', 'content': 0.00025507682585157454, 'timestamp': '2025-09-15 03:17:15.905779', 'step': 1331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:15.936741', 'step': 1331, 'epoch': 3} {'type': 'loss', 'content': 0.00046792227658443153, 'timestamp': '2025-09-15 03:17:15.960274', 'step': 1332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:15.990979', 'step': 1332, 'epoch': 3} {'type': 'loss', 'content': 0.0033070999197661877, 'timestamp': '2025-09-15 03:17:15.995891', 'step': 1333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:16.026181', 'step': 1333, 'epoch': 3} {'type': 'loss', 'content': 0.002882573287934065, 'timestamp': '2025-09-15 03:17:16.028235', 'step': 1334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:16.059017', 'step': 1334, 'epoch': 3} {'type': 'loss', 'content': 0.0002578755666036159, 'timestamp': '2025-09-15 03:17:16.065682', 'step': 1335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:16.096267', 'step': 1335, 'epoch': 3} {'type': 'loss', 'content': 0.0017550382763147354, 'timestamp': '2025-09-15 03:17:16.120074', 'step': 1336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:16.150812', 'step': 1336, 'epoch': 3} {'type': 'loss', 'content': 0.00020929174206685275, 'timestamp': '2025-09-15 03:17:16.157749', 'step': 1337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:16.190951', 'step': 1337, 'epoch': 3} {'type': 'loss', 'content': 0.0007545395055785775, 'timestamp': '2025-09-15 03:17:16.195321', 'step': 1338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:16.225885', 'step': 1338, 'epoch': 3} {'type': 'loss', 'content': 0.032332248985767365, 'timestamp': '2025-09-15 03:17:16.229702', 'step': 1339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:16.259973', 'step': 1339, 'epoch': 3} {'type': 'loss', 'content': 0.005746941082179546, 'timestamp': '2025-09-15 03:17:16.285314', 'step': 1340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:16.315600', 'step': 1340, 'epoch': 3} {'type': 'loss', 'content': 0.0004919808125123382, 'timestamp': '2025-09-15 03:17:16.317829', 'step': 1341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:16.348614', 'step': 1341, 'epoch': 3} {'type': 'loss', 'content': 0.0002653584233485162, 'timestamp': '2025-09-15 03:17:16.352545', 'step': 1342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:16.383219', 'step': 1342, 'epoch': 3} {'type': 'loss', 'content': 0.00044335488928481936, 'timestamp': '2025-09-15 03:17:16.385358', 'step': 1343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:16.415699', 'step': 1343, 'epoch': 3} {'type': 'loss', 'content': 0.0016609487356618047, 'timestamp': '2025-09-15 03:17:16.439516', 'step': 1344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:16.470661', 'step': 1344, 'epoch': 3} {'type': 'loss', 'content': 0.0016143351094797254, 'timestamp': '2025-09-15 03:17:16.475637', 'step': 1345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:16.517008', 'step': 1345, 'epoch': 3} {'type': 'loss', 'content': 0.0006745156715624034, 'timestamp': '2025-09-15 03:17:16.519452', 'step': 1346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:16.549564', 'step': 1346, 'epoch': 3} {'type': 'loss', 'content': 0.0014027913566678762, 'timestamp': '2025-09-15 03:17:16.556047', 'step': 1347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:16.586376', 'step': 1347, 'epoch': 3} {'type': 'loss', 'content': 0.0003329945320729166, 'timestamp': '2025-09-15 03:17:16.611725', 'step': 1348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:16.641501', 'step': 1348, 'epoch': 3} {'type': 'loss', 'content': 0.0014508651802316308, 'timestamp': '2025-09-15 03:17:16.643638', 'step': 1349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:16.674134', 'step': 1349, 'epoch': 3} {'type': 'loss', 'content': 0.0008561424911022186, 'timestamp': '2025-09-15 03:17:16.678392', 'step': 1350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:16.708629', 'step': 1350, 'epoch': 3} {'type': 'loss', 'content': 0.0001607358717592433, 'timestamp': '2025-09-15 03:17:16.716321', 'step': 1351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:16.746710', 'step': 1351, 'epoch': 3} {'type': 'loss', 'content': 0.0013879425823688507, 'timestamp': '2025-09-15 03:17:16.770497', 'step': 1352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:16.802699', 'step': 1352, 'epoch': 3} {'type': 'loss', 'content': 0.00041087184217758477, 'timestamp': '2025-09-15 03:17:16.805119', 'step': 1353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:16.835324', 'step': 1353, 'epoch': 3} {'type': 'loss', 'content': 0.0003274380578659475, 'timestamp': '2025-09-15 03:17:16.837342', 'step': 1354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:16.867347', 'step': 1354, 'epoch': 3} {'type': 'loss', 'content': 0.0003536785370670259, 'timestamp': '2025-09-15 03:17:16.869553', 'step': 1355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:16.900490', 'step': 1355, 'epoch': 3} {'type': 'loss', 'content': 0.007172483950853348, 'timestamp': '2025-09-15 03:17:16.925213', 'step': 1356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:16.955759', 'step': 1356, 'epoch': 3} {'type': 'loss', 'content': 0.004883793648332357, 'timestamp': '2025-09-15 03:17:16.957832', 'step': 1357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:16.987871', 'step': 1357, 'epoch': 3} {'type': 'loss', 'content': 0.00017396271869074553, 'timestamp': '2025-09-15 03:17:16.989906', 'step': 1358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:17.021886', 'step': 1358, 'epoch': 3} {'type': 'loss', 'content': 0.0007931128493510187, 'timestamp': '2025-09-15 03:17:17.025637', 'step': 1359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:17.056290', 'step': 1359, 'epoch': 3} {'type': 'loss', 'content': 0.00537915201857686, 'timestamp': '2025-09-15 03:17:17.084613', 'step': 1360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:17.115120', 'step': 1360, 'epoch': 3} {'type': 'loss', 'content': 0.0009680739603936672, 'timestamp': '2025-09-15 03:17:17.117300', 'step': 1361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:17.147655', 'step': 1361, 'epoch': 3} {'type': 'loss', 'content': 0.0007127004791982472, 'timestamp': '2025-09-15 03:17:17.152103', 'step': 1362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:17.182137', 'step': 1362, 'epoch': 3} {'type': 'loss', 'content': 0.0012851905776187778, 'timestamp': '2025-09-15 03:17:17.184109', 'step': 1363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:17.214588', 'step': 1363, 'epoch': 3} {'type': 'loss', 'content': 0.0003384167794138193, 'timestamp': '2025-09-15 03:17:17.242119', 'step': 1364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:17.272477', 'step': 1364, 'epoch': 3} {'type': 'loss', 'content': 0.0007884152000769973, 'timestamp': '2025-09-15 03:17:17.274543', 'step': 1365, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:17.856696', 'step': 1365, 'epoch': 3} {'type': 'pplx', 'content': 119068416.19124384, 'timestamp': '2025-09-15 03:17:17.858747', 'step': 1365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 288], 'flops': 8543129804160}, 'timestamp': '2025-09-15 03:17:17.888098', 'step': 1365, 'epoch': 3} {'type': 'loss', 'content': 0.0002495049557182938, 'timestamp': '2025-09-15 03:17:17.898583', 'step': 1366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:17.929158', 'step': 1366, 'epoch': 3} {'type': 'loss', 'content': 0.0002520255511626601, 'timestamp': '2025-09-15 03:17:17.933377', 'step': 1367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:17.963662', 'step': 1367, 'epoch': 3} {'type': 'loss', 'content': 0.00036317037302069366, 'timestamp': '2025-09-15 03:17:17.987417', 'step': 1368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:18.017858', 'step': 1368, 'epoch': 3} {'type': 'loss', 'content': 0.0032210820354521275, 'timestamp': '2025-09-15 03:17:18.020093', 'step': 1369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:18.051133', 'step': 1369, 'epoch': 3} {'type': 'loss', 'content': 0.0008396569755859673, 'timestamp': '2025-09-15 03:17:18.057839', 'step': 1370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:18.088284', 'step': 1370, 'epoch': 3} {'type': 'loss', 'content': 0.002095963107421994, 'timestamp': '2025-09-15 03:17:18.090526', 'step': 1371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:18.120947', 'step': 1371, 'epoch': 3} {'type': 'loss', 'content': 0.001525636063888669, 'timestamp': '2025-09-15 03:17:18.146237', 'step': 1372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:18.176471', 'step': 1372, 'epoch': 3} {'type': 'loss', 'content': 0.00012938915460836142, 'timestamp': '2025-09-15 03:17:18.178864', 'step': 1373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:18.209188', 'step': 1373, 'epoch': 3} {'type': 'loss', 'content': 0.00017872024909593165, 'timestamp': '2025-09-15 03:17:18.211448', 'step': 1374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:18.243395', 'step': 1374, 'epoch': 3} {'type': 'loss', 'content': 0.0010354757541790605, 'timestamp': '2025-09-15 03:17:18.247754', 'step': 1375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:18.278609', 'step': 1375, 'epoch': 3} {'type': 'loss', 'content': 0.0005616145208477974, 'timestamp': '2025-09-15 03:17:18.309279', 'step': 1376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:18.340015', 'step': 1376, 'epoch': 3} {'type': 'loss', 'content': 0.0001830124092521146, 'timestamp': '2025-09-15 03:17:18.342241', 'step': 1377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:18.372319', 'step': 1377, 'epoch': 3} {'type': 'loss', 'content': 0.0011224261252209544, 'timestamp': '2025-09-15 03:17:18.374380', 'step': 1378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:18.404930', 'step': 1378, 'epoch': 3} {'type': 'loss', 'content': 0.0008521327981725335, 'timestamp': '2025-09-15 03:17:18.408767', 'step': 1379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:18.439865', 'step': 1379, 'epoch': 3} {'type': 'loss', 'content': 0.00017294203280471265, 'timestamp': '2025-09-15 03:17:18.467276', 'step': 1380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:18.498338', 'step': 1380, 'epoch': 3} {'type': 'loss', 'content': 0.003641907824203372, 'timestamp': '2025-09-15 03:17:18.503055', 'step': 1381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:18.533473', 'step': 1381, 'epoch': 3} {'type': 'loss', 'content': 0.00016708121984265745, 'timestamp': '2025-09-15 03:17:18.537813', 'step': 1382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:18.568192', 'step': 1382, 'epoch': 3} {'type': 'loss', 'content': 0.029590027406811714, 'timestamp': '2025-09-15 03:17:18.570374', 'step': 1383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:18.600642', 'step': 1383, 'epoch': 3} {'type': 'loss', 'content': 0.00025269907200708985, 'timestamp': '2025-09-15 03:17:18.624692', 'step': 1384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:18.659121', 'step': 1384, 'epoch': 3} {'type': 'loss', 'content': 0.002378477482125163, 'timestamp': '2025-09-15 03:17:18.662081', 'step': 1385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:18.693696', 'step': 1385, 'epoch': 3} {'type': 'loss', 'content': 8.785168029135093e-05, 'timestamp': '2025-09-15 03:17:18.696709', 'step': 1386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:18.727770', 'step': 1386, 'epoch': 3} {'type': 'loss', 'content': 0.0009063719771802425, 'timestamp': '2025-09-15 03:17:18.730435', 'step': 1387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:18.761287', 'step': 1387, 'epoch': 3} {'type': 'loss', 'content': 0.0010425930377095938, 'timestamp': '2025-09-15 03:17:18.789671', 'step': 1388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:18.820163', 'step': 1388, 'epoch': 3} {'type': 'loss', 'content': 0.0005211577517911792, 'timestamp': '2025-09-15 03:17:18.822298', 'step': 1389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:18.852697', 'step': 1389, 'epoch': 3} {'type': 'loss', 'content': 0.001825684099458158, 'timestamp': '2025-09-15 03:17:18.854870', 'step': 1390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:18.885119', 'step': 1390, 'epoch': 3} {'type': 'loss', 'content': 0.000622618361376226, 'timestamp': '2025-09-15 03:17:18.887079', 'step': 1391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:18.917409', 'step': 1391, 'epoch': 3} {'type': 'loss', 'content': 0.00014275507419370115, 'timestamp': '2025-09-15 03:17:18.940865', 'step': 1392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:18.971215', 'step': 1392, 'epoch': 3} {'type': 'loss', 'content': 0.0006256733322516084, 'timestamp': '2025-09-15 03:17:18.973235', 'step': 1393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:19.003969', 'step': 1393, 'epoch': 3} {'type': 'loss', 'content': 9.841892460826784e-05, 'timestamp': '2025-09-15 03:17:19.007844', 'step': 1394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:19.038537', 'step': 1394, 'epoch': 3} {'type': 'loss', 'content': 0.0001524169056210667, 'timestamp': '2025-09-15 03:17:19.042402', 'step': 1395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:19.073659', 'step': 1395, 'epoch': 3} {'type': 'loss', 'content': 0.0001233371876878664, 'timestamp': '2025-09-15 03:17:19.098896', 'step': 1396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:19.129491', 'step': 1396, 'epoch': 3} {'type': 'loss', 'content': 0.001133440644480288, 'timestamp': '2025-09-15 03:17:19.131592', 'step': 1397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:19.161991', 'step': 1397, 'epoch': 3} {'type': 'loss', 'content': 0.0007659767870791256, 'timestamp': '2025-09-15 03:17:19.164197', 'step': 1398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:19.194321', 'step': 1398, 'epoch': 3} {'type': 'loss', 'content': 0.00019041731138713658, 'timestamp': '2025-09-15 03:17:19.196888', 'step': 1399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:19.227451', 'step': 1399, 'epoch': 3} {'type': 'loss', 'content': 0.0005006372230127454, 'timestamp': '2025-09-15 03:17:19.252419', 'step': 1400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:19.282650', 'step': 1400, 'epoch': 3} {'type': 'loss', 'content': 0.00273110275156796, 'timestamp': '2025-09-15 03:17:19.284825', 'step': 1401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:19.315783', 'step': 1401, 'epoch': 3} {'type': 'loss', 'content': 0.00023853448510635644, 'timestamp': '2025-09-15 03:17:19.322259', 'step': 1402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:19.352756', 'step': 1402, 'epoch': 3} {'type': 'loss', 'content': 0.0010633624624460936, 'timestamp': '2025-09-15 03:17:19.354929', 'step': 1403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:19.385636', 'step': 1403, 'epoch': 3} {'type': 'loss', 'content': 0.0003287098079454154, 'timestamp': '2025-09-15 03:17:19.410907', 'step': 1404, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:19.997404', 'step': 1404, 'epoch': 3} {'type': 'pplx', 'content': 125354850.80082554, 'timestamp': '2025-09-15 03:17:19.999430', 'step': 1404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:20.027669', 'step': 1404, 'epoch': 3} {'type': 'loss', 'content': 0.002071457216516137, 'timestamp': '2025-09-15 03:17:20.030890', 'step': 1405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:20.061827', 'step': 1405, 'epoch': 3} {'type': 'loss', 'content': 0.00012812399654649198, 'timestamp': '2025-09-15 03:17:20.068618', 'step': 1406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.099609', 'step': 1406, 'epoch': 3} {'type': 'loss', 'content': 0.0007547488785348833, 'timestamp': '2025-09-15 03:17:20.103533', 'step': 1407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:20.133573', 'step': 1407, 'epoch': 3} {'type': 'loss', 'content': 9.140604379354045e-05, 'timestamp': '2025-09-15 03:17:20.157184', 'step': 1408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:20.187758', 'step': 1408, 'epoch': 3} {'type': 'loss', 'content': 0.002761248266324401, 'timestamp': '2025-09-15 03:17:20.192625', 'step': 1409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:20.222838', 'step': 1409, 'epoch': 3} {'type': 'loss', 'content': 0.0002269747928949073, 'timestamp': '2025-09-15 03:17:20.227381', 'step': 1410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:20.257969', 'step': 1410, 'epoch': 3} {'type': 'loss', 'content': 0.00021279798238538206, 'timestamp': '2025-09-15 03:17:20.260552', 'step': 1411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.290671', 'step': 1411, 'epoch': 3} {'type': 'loss', 'content': 0.001422856585122645, 'timestamp': '2025-09-15 03:17:20.315554', 'step': 1412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.345574', 'step': 1412, 'epoch': 3} {'type': 'loss', 'content': 8.99404039955698e-05, 'timestamp': '2025-09-15 03:17:20.347563', 'step': 1413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:20.377831', 'step': 1413, 'epoch': 3} {'type': 'loss', 'content': 0.04837150126695633, 'timestamp': '2025-09-15 03:17:20.380351', 'step': 1414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:20.410909', 'step': 1414, 'epoch': 3} {'type': 'loss', 'content': 0.0014580102870240808, 'timestamp': '2025-09-15 03:17:20.418169', 'step': 1415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.449235', 'step': 1415, 'epoch': 3} {'type': 'loss', 'content': 0.02324610762298107, 'timestamp': '2025-09-15 03:17:20.474197', 'step': 1416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:20.504773', 'step': 1416, 'epoch': 3} {'type': 'loss', 'content': 0.0024212843272835016, 'timestamp': '2025-09-15 03:17:20.506889', 'step': 1417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:20.537342', 'step': 1417, 'epoch': 3} {'type': 'loss', 'content': 4.8245903599308804e-05, 'timestamp': '2025-09-15 03:17:20.544449', 'step': 1418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.575729', 'step': 1418, 'epoch': 3} {'type': 'loss', 'content': 0.0004255035310052335, 'timestamp': '2025-09-15 03:17:20.579672', 'step': 1419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:20.609848', 'step': 1419, 'epoch': 3} {'type': 'loss', 'content': 0.00013181402755435556, 'timestamp': '2025-09-15 03:17:20.633286', 'step': 1420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:20.663715', 'step': 1420, 'epoch': 3} {'type': 'loss', 'content': 0.00017817116167861968, 'timestamp': '2025-09-15 03:17:20.665800', 'step': 1421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:20.696500', 'step': 1421, 'epoch': 3} {'type': 'loss', 'content': 0.00030947051709517837, 'timestamp': '2025-09-15 03:17:20.698821', 'step': 1422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:20.730573', 'step': 1422, 'epoch': 3} {'type': 'loss', 'content': 0.0001780502061592415, 'timestamp': '2025-09-15 03:17:20.732659', 'step': 1423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:20.763220', 'step': 1423, 'epoch': 3} {'type': 'loss', 'content': 8.599001739639789e-05, 'timestamp': '2025-09-15 03:17:20.786767', 'step': 1424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.817233', 'step': 1424, 'epoch': 3} {'type': 'loss', 'content': 0.00014561659190803766, 'timestamp': '2025-09-15 03:17:20.819123', 'step': 1425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:20.849211', 'step': 1425, 'epoch': 3} {'type': 'loss', 'content': 0.00022120056382846087, 'timestamp': '2025-09-15 03:17:20.851161', 'step': 1426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:20.881931', 'step': 1426, 'epoch': 3} {'type': 'loss', 'content': 0.00026828559930436313, 'timestamp': '2025-09-15 03:17:20.886010', 'step': 1427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:20.916519', 'step': 1427, 'epoch': 3} {'type': 'loss', 'content': 0.002969352062791586, 'timestamp': '2025-09-15 03:17:20.944556', 'step': 1428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:20.974854', 'step': 1428, 'epoch': 3} {'type': 'loss', 'content': 0.00018780956452246755, 'timestamp': '2025-09-15 03:17:20.976874', 'step': 1429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:21.007740', 'step': 1429, 'epoch': 3} {'type': 'loss', 'content': 0.0001267143670702353, 'timestamp': '2025-09-15 03:17:21.010161', 'step': 1430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:21.040085', 'step': 1430, 'epoch': 3} {'type': 'loss', 'content': 0.001236670301295817, 'timestamp': '2025-09-15 03:17:21.042101', 'step': 1431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:21.073383', 'step': 1431, 'epoch': 3} {'type': 'loss', 'content': 0.0011406567646190524, 'timestamp': '2025-09-15 03:17:21.098157', 'step': 1432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:21.128481', 'step': 1432, 'epoch': 3} {'type': 'loss', 'content': 0.0039399052038788795, 'timestamp': '2025-09-15 03:17:21.130508', 'step': 1433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:21.161138', 'step': 1433, 'epoch': 3} {'type': 'loss', 'content': 0.0001970245357370004, 'timestamp': '2025-09-15 03:17:21.167836', 'step': 1434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:21.198362', 'step': 1434, 'epoch': 3} {'type': 'loss', 'content': 0.0003503093321342021, 'timestamp': '2025-09-15 03:17:21.202514', 'step': 1435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:21.232623', 'step': 1435, 'epoch': 3} {'type': 'loss', 'content': 9.151557605946437e-05, 'timestamp': '2025-09-15 03:17:21.257529', 'step': 1436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:21.287382', 'step': 1436, 'epoch': 3} {'type': 'loss', 'content': 0.00021274581376928836, 'timestamp': '2025-09-15 03:17:21.289446', 'step': 1437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:21.319495', 'step': 1437, 'epoch': 3} {'type': 'loss', 'content': 0.0023747701197862625, 'timestamp': '2025-09-15 03:17:21.321406', 'step': 1438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:21.351725', 'step': 1438, 'epoch': 3} {'type': 'loss', 'content': 0.00023102053091861308, 'timestamp': '2025-09-15 03:17:21.354202', 'step': 1439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:21.384941', 'step': 1439, 'epoch': 3} {'type': 'loss', 'content': 0.0003894390829373151, 'timestamp': '2025-09-15 03:17:21.409765', 'step': 1440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:21.439986', 'step': 1440, 'epoch': 3} {'type': 'loss', 'content': 5.9962829254800454e-05, 'timestamp': '2025-09-15 03:17:21.441978', 'step': 1441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:21.473888', 'step': 1441, 'epoch': 3} {'type': 'loss', 'content': 0.011845647357404232, 'timestamp': '2025-09-15 03:17:21.478136', 'step': 1442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:21.512786', 'step': 1442, 'epoch': 3} {'type': 'loss', 'content': 0.0003542363701853901, 'timestamp': '2025-09-15 03:17:21.517346', 'step': 1443, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:22.125497', 'step': 1443, 'epoch': 3} {'type': 'pplx', 'content': 126089176.76405717, 'timestamp': '2025-09-15 03:17:22.127467', 'step': 1443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:22.156150', 'step': 1443, 'epoch': 3} {'type': 'loss', 'content': 8.075464575085789e-05, 'timestamp': '2025-09-15 03:17:22.186025', 'step': 1444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:22.216635', 'step': 1444, 'epoch': 3} {'type': 'loss', 'content': 6.827951438026503e-05, 'timestamp': '2025-09-15 03:17:22.223525', 'step': 1445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:22.254632', 'step': 1445, 'epoch': 3} {'type': 'loss', 'content': 0.00014688474766444415, 'timestamp': '2025-09-15 03:17:22.260996', 'step': 1446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:22.293658', 'step': 1446, 'epoch': 3} {'type': 'loss', 'content': 0.00030845263972878456, 'timestamp': '2025-09-15 03:17:22.300301', 'step': 1447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:22.330668', 'step': 1447, 'epoch': 3} {'type': 'loss', 'content': 0.00010615136852720752, 'timestamp': '2025-09-15 03:17:22.355509', 'step': 1448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:22.385780', 'step': 1448, 'epoch': 3} {'type': 'loss', 'content': 0.0005661295726895332, 'timestamp': '2025-09-15 03:17:22.387759', 'step': 1449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:22.425443', 'step': 1449, 'epoch': 3} {'type': 'loss', 'content': 0.040414344519376755, 'timestamp': '2025-09-15 03:17:22.427588', 'step': 1450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:22.457225', 'step': 1450, 'epoch': 3} {'type': 'loss', 'content': 8.917743980418891e-05, 'timestamp': '2025-09-15 03:17:22.459330', 'step': 1451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:22.491053', 'step': 1451, 'epoch': 3} {'type': 'loss', 'content': 0.0008629755466245115, 'timestamp': '2025-09-15 03:17:22.514575', 'step': 1452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:22.545058', 'step': 1452, 'epoch': 3} {'type': 'loss', 'content': 0.009162337519228458, 'timestamp': '2025-09-15 03:17:22.547113', 'step': 1453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:22.578072', 'step': 1453, 'epoch': 3} {'type': 'loss', 'content': 0.002335967728868127, 'timestamp': '2025-09-15 03:17:22.584799', 'step': 1454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:22.615019', 'step': 1454, 'epoch': 3} {'type': 'loss', 'content': 0.0006352242780849338, 'timestamp': '2025-09-15 03:17:22.617709', 'step': 1455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:22.648379', 'step': 1455, 'epoch': 3} {'type': 'loss', 'content': 0.0012936294078826904, 'timestamp': '2025-09-15 03:17:22.671970', 'step': 1456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:22.703783', 'step': 1456, 'epoch': 3} {'type': 'loss', 'content': 0.0026901266537606716, 'timestamp': '2025-09-15 03:17:22.707835', 'step': 1457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:22.738628', 'step': 1457, 'epoch': 3} {'type': 'loss', 'content': 0.0022749509662389755, 'timestamp': '2025-09-15 03:17:22.741020', 'step': 1458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:22.771416', 'step': 1458, 'epoch': 3} {'type': 'loss', 'content': 0.0003408204356674105, 'timestamp': '2025-09-15 03:17:22.773491', 'step': 1459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:22.805075', 'step': 1459, 'epoch': 3} {'type': 'loss', 'content': 0.008805022574961185, 'timestamp': '2025-09-15 03:17:22.830041', 'step': 1460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:22.860421', 'step': 1460, 'epoch': 3} {'type': 'loss', 'content': 9.171007695840672e-05, 'timestamp': '2025-09-15 03:17:22.862738', 'step': 1461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:22.893515', 'step': 1461, 'epoch': 3} {'type': 'loss', 'content': 7.348069630097598e-05, 'timestamp': '2025-09-15 03:17:22.895796', 'step': 1462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:22.926014', 'step': 1462, 'epoch': 3} {'type': 'loss', 'content': 0.00017722553457133472, 'timestamp': '2025-09-15 03:17:22.928111', 'step': 1463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:22.958789', 'step': 1463, 'epoch': 3} {'type': 'loss', 'content': 0.00032406425452791154, 'timestamp': '2025-09-15 03:17:22.987286', 'step': 1464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:23.017892', 'step': 1464, 'epoch': 3} {'type': 'loss', 'content': 0.00012766069266945124, 'timestamp': '2025-09-15 03:17:23.020001', 'step': 1465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:23.050747', 'step': 1465, 'epoch': 3} {'type': 'loss', 'content': 0.00027193533605895936, 'timestamp': '2025-09-15 03:17:23.058292', 'step': 1466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:23.088702', 'step': 1466, 'epoch': 3} {'type': 'loss', 'content': 5.195228732191026e-05, 'timestamp': '2025-09-15 03:17:23.092907', 'step': 1467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:23.124010', 'step': 1467, 'epoch': 3} {'type': 'loss', 'content': 0.002318126615136862, 'timestamp': '2025-09-15 03:17:23.155314', 'step': 1468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:23.186314', 'step': 1468, 'epoch': 3} {'type': 'loss', 'content': 0.0008258981979452074, 'timestamp': '2025-09-15 03:17:23.190907', 'step': 1469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:23.221516', 'step': 1469, 'epoch': 3} {'type': 'loss', 'content': 0.0005010002059862018, 'timestamp': '2025-09-15 03:17:23.228134', 'step': 1470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:23.258183', 'step': 1470, 'epoch': 3} {'type': 'loss', 'content': 0.006026547867804766, 'timestamp': '2025-09-15 03:17:23.260299', 'step': 1471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:23.290785', 'step': 1471, 'epoch': 3} {'type': 'loss', 'content': 0.00020711333490908146, 'timestamp': '2025-09-15 03:17:23.314496', 'step': 1472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:23.345215', 'step': 1472, 'epoch': 3} {'type': 'loss', 'content': 0.0006892742239870131, 'timestamp': '2025-09-15 03:17:23.347304', 'step': 1473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:23.377211', 'step': 1473, 'epoch': 3} {'type': 'loss', 'content': 9.77161034825258e-05, 'timestamp': '2025-09-15 03:17:23.379297', 'step': 1474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:23.410623', 'step': 1474, 'epoch': 3} {'type': 'loss', 'content': 4.4186213926877826e-05, 'timestamp': '2025-09-15 03:17:23.417619', 'step': 1475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:23.448364', 'step': 1475, 'epoch': 3} {'type': 'loss', 'content': 0.018671514466404915, 'timestamp': '2025-09-15 03:17:23.472111', 'step': 1476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:23.503974', 'step': 1476, 'epoch': 3} {'type': 'loss', 'content': 0.00743169104680419, 'timestamp': '2025-09-15 03:17:23.505979', 'step': 1477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:23.536380', 'step': 1477, 'epoch': 3} {'type': 'loss', 'content': 0.00011039253149647266, 'timestamp': '2025-09-15 03:17:23.543811', 'step': 1478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:23.574266', 'step': 1478, 'epoch': 3} {'type': 'loss', 'content': 0.0047522238455712795, 'timestamp': '2025-09-15 03:17:23.581681', 'step': 1479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:23.613471', 'step': 1479, 'epoch': 3} {'type': 'loss', 'content': 0.00016252427303697914, 'timestamp': '2025-09-15 03:17:23.636942', 'step': 1480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:23.667421', 'step': 1480, 'epoch': 3} {'type': 'loss', 'content': 0.000923981424421072, 'timestamp': '2025-09-15 03:17:23.669863', 'step': 1481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:23.700883', 'step': 1481, 'epoch': 3} {'type': 'loss', 'content': 0.00042834514169953763, 'timestamp': '2025-09-15 03:17:23.708462', 'step': 1482, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:24.297591', 'step': 1482, 'epoch': 3} {'type': 'pplx', 'content': 126383143.71461472, 'timestamp': '2025-09-15 03:17:24.299474', 'step': 1482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:24.328575', 'step': 1482, 'epoch': 3} {'type': 'loss', 'content': 0.0013859715545549989, 'timestamp': '2025-09-15 03:17:24.332626', 'step': 1483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:24.363124', 'step': 1483, 'epoch': 3} {'type': 'loss', 'content': 0.0002539039996918291, 'timestamp': '2025-09-15 03:17:24.388304', 'step': 1484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:24.418678', 'step': 1484, 'epoch': 3} {'type': 'loss', 'content': 0.018063468858599663, 'timestamp': '2025-09-15 03:17:24.420936', 'step': 1485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:24.452061', 'step': 1485, 'epoch': 3} {'type': 'loss', 'content': 0.00024127533833961934, 'timestamp': '2025-09-15 03:17:24.456417', 'step': 1486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:24.487198', 'step': 1486, 'epoch': 3} {'type': 'loss', 'content': 0.00022271994384936988, 'timestamp': '2025-09-15 03:17:24.494067', 'step': 1487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:24.524540', 'step': 1487, 'epoch': 3} {'type': 'loss', 'content': 0.00039580193697474897, 'timestamp': '2025-09-15 03:17:24.549530', 'step': 1488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:24.580906', 'step': 1488, 'epoch': 3} {'type': 'loss', 'content': 0.007025205530226231, 'timestamp': '2025-09-15 03:17:24.585235', 'step': 1489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:24.616611', 'step': 1489, 'epoch': 3} {'type': 'loss', 'content': 0.0018606951925903559, 'timestamp': '2025-09-15 03:17:24.621126', 'step': 1490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:24.653171', 'step': 1490, 'epoch': 3} {'type': 'loss', 'content': 0.0014626128831878304, 'timestamp': '2025-09-15 03:17:24.659765', 'step': 1491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:24.691466', 'step': 1491, 'epoch': 3} {'type': 'loss', 'content': 0.00030919170239940286, 'timestamp': '2025-09-15 03:17:24.719695', 'step': 1492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:24.751431', 'step': 1492, 'epoch': 3} {'type': 'loss', 'content': 7.744396134512499e-05, 'timestamp': '2025-09-15 03:17:24.753810', 'step': 1493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:24.785352', 'step': 1493, 'epoch': 3} {'type': 'loss', 'content': 0.005347420461475849, 'timestamp': '2025-09-15 03:17:24.789035', 'step': 1494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:24.827759', 'step': 1494, 'epoch': 3} {'type': 'loss', 'content': 3.8123798731248826e-05, 'timestamp': '2025-09-15 03:17:24.833829', 'step': 1495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:24.863895', 'step': 1495, 'epoch': 3} {'type': 'loss', 'content': 5.30069628439378e-05, 'timestamp': '2025-09-15 03:17:24.887515', 'step': 1496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:24.918993', 'step': 1496, 'epoch': 3} {'type': 'loss', 'content': 8.05064119049348e-05, 'timestamp': '2025-09-15 03:17:24.921006', 'step': 1497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:24.951675', 'step': 1497, 'epoch': 3} {'type': 'loss', 'content': 0.0006126973894424736, 'timestamp': '2025-09-15 03:17:24.953857', 'step': 1498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:24.985111', 'step': 1498, 'epoch': 3} {'type': 'loss', 'content': 0.0006413686205632985, 'timestamp': '2025-09-15 03:17:24.987174', 'step': 1499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:25.017442', 'step': 1499, 'epoch': 3} {'type': 'loss', 'content': 0.00629863515496254, 'timestamp': '2025-09-15 03:17:25.041021', 'step': 1500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-09-15 03:17:31.238254', 'step': 1500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:31.273356', 'step': 1500, 'epoch': 3} {'type': 'loss', 'content': 0.020123621448874474, 'timestamp': '2025-09-15 03:17:31.275565', 'step': 1501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:31.306674', 'step': 1501, 'epoch': 3} {'type': 'loss', 'content': 0.0001477115583838895, 'timestamp': '2025-09-15 03:17:31.308707', 'step': 1502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:31.339513', 'step': 1502, 'epoch': 3} {'type': 'loss', 'content': 0.00033375757629983127, 'timestamp': '2025-09-15 03:17:31.345785', 'step': 1503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:31.377326', 'step': 1503, 'epoch': 3} {'type': 'loss', 'content': 4.395996074890718e-05, 'timestamp': '2025-09-15 03:17:31.405520', 'step': 1504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:31.436569', 'step': 1504, 'epoch': 3} {'type': 'loss', 'content': 0.008604790084064007, 'timestamp': '2025-09-15 03:17:31.438631', 'step': 1505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:31.468975', 'step': 1505, 'epoch': 3} {'type': 'loss', 'content': 0.00032148772152140737, 'timestamp': '2025-09-15 03:17:31.475518', 'step': 1506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:31.507445', 'step': 1506, 'epoch': 3} {'type': 'loss', 'content': 0.00021968342480249703, 'timestamp': '2025-09-15 03:17:31.509731', 'step': 1507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:31.540464', 'step': 1507, 'epoch': 3} {'type': 'loss', 'content': 0.0002701000776141882, 'timestamp': '2025-09-15 03:17:31.564206', 'step': 1508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:31.594640', 'step': 1508, 'epoch': 3} {'type': 'loss', 'content': 0.00018474875832907856, 'timestamp': '2025-09-15 03:17:31.596720', 'step': 1509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:31.626769', 'step': 1509, 'epoch': 3} {'type': 'loss', 'content': 0.00023964645515661687, 'timestamp': '2025-09-15 03:17:31.629305', 'step': 1510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:31.659628', 'step': 1510, 'epoch': 3} {'type': 'loss', 'content': 0.00017940135148819536, 'timestamp': '2025-09-15 03:17:31.661917', 'step': 1511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:31.692443', 'step': 1511, 'epoch': 3} {'type': 'loss', 'content': 0.0011125266319140792, 'timestamp': '2025-09-15 03:17:31.716053', 'step': 1512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:31.747264', 'step': 1512, 'epoch': 3} {'type': 'loss', 'content': 4.7944111429387704e-05, 'timestamp': '2025-09-15 03:17:31.749380', 'step': 1513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:31.779733', 'step': 1513, 'epoch': 3} {'type': 'loss', 'content': 0.0005373401218093932, 'timestamp': '2025-09-15 03:17:31.782096', 'step': 1514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:31.812540', 'step': 1514, 'epoch': 3} {'type': 'loss', 'content': 0.00032914456096477807, 'timestamp': '2025-09-15 03:17:31.816560', 'step': 1515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:31.847803', 'step': 1515, 'epoch': 3} {'type': 'loss', 'content': 5.4149852076079696e-05, 'timestamp': '2025-09-15 03:17:31.871442', 'step': 1516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:31.902172', 'step': 1516, 'epoch': 3} {'type': 'loss', 'content': 0.001020874478854239, 'timestamp': '2025-09-15 03:17:31.904196', 'step': 1517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:31.936442', 'step': 1517, 'epoch': 3} {'type': 'loss', 'content': 0.0020698870066553354, 'timestamp': '2025-09-15 03:17:31.942997', 'step': 1518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:31.975035', 'step': 1518, 'epoch': 3} {'type': 'loss', 'content': 0.0002697810996323824, 'timestamp': '2025-09-15 03:17:31.982535', 'step': 1519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:32.013987', 'step': 1519, 'epoch': 3} {'type': 'loss', 'content': 6.943456537555903e-05, 'timestamp': '2025-09-15 03:17:32.041704', 'step': 1520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:32.072340', 'step': 1520, 'epoch': 3} {'type': 'loss', 'content': 0.00020672274695243686, 'timestamp': '2025-09-15 03:17:32.074605', 'step': 1521, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:32.663987', 'step': 1521, 'epoch': 3} {'type': 'pplx', 'content': 125235580.84444861, 'timestamp': '2025-09-15 03:17:32.665675', 'step': 1521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:32.695624', 'step': 1521, 'epoch': 3} {'type': 'loss', 'content': 5.122490256326273e-05, 'timestamp': '2025-09-15 03:17:32.704972', 'step': 1522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:32.736033', 'step': 1522, 'epoch': 3} {'type': 'loss', 'content': 0.00010624745482346043, 'timestamp': '2025-09-15 03:17:32.738523', 'step': 1523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:32.771574', 'step': 1523, 'epoch': 3} {'type': 'loss', 'content': 0.00021359531092457473, 'timestamp': '2025-09-15 03:17:32.799726', 'step': 1524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:32.831616', 'step': 1524, 'epoch': 3} {'type': 'loss', 'content': 0.0006508774240501225, 'timestamp': '2025-09-15 03:17:32.833852', 'step': 1525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:32.865074', 'step': 1525, 'epoch': 3} {'type': 'loss', 'content': 0.0001420353801222518, 'timestamp': '2025-09-15 03:17:32.868944', 'step': 1526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:32.899428', 'step': 1526, 'epoch': 3} {'type': 'loss', 'content': 0.0002659276651684195, 'timestamp': '2025-09-15 03:17:32.903686', 'step': 1527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:32.934591', 'step': 1527, 'epoch': 3} {'type': 'loss', 'content': 0.0002821832022164017, 'timestamp': '2025-09-15 03:17:32.958272', 'step': 1528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:32.988858', 'step': 1528, 'epoch': 3} {'type': 'loss', 'content': 0.0001353594707325101, 'timestamp': '2025-09-15 03:17:32.994163', 'step': 1529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-15 03:17:33.025699', 'step': 1529, 'epoch': 3} {'type': 'loss', 'content': 0.00012022336886730045, 'timestamp': '2025-09-15 03:17:33.037465', 'step': 1530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:33.068693', 'step': 1530, 'epoch': 3} {'type': 'loss', 'content': 8.694497955730185e-05, 'timestamp': '2025-09-15 03:17:33.072782', 'step': 1531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:33.104407', 'step': 1531, 'epoch': 3} {'type': 'loss', 'content': 0.0010885964147746563, 'timestamp': '2025-09-15 03:17:33.131984', 'step': 1532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:33.162936', 'step': 1532, 'epoch': 3} {'type': 'loss', 'content': 9.456718544242904e-05, 'timestamp': '2025-09-15 03:17:33.167019', 'step': 1533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:33.198015', 'step': 1533, 'epoch': 3} {'type': 'loss', 'content': 7.467066461686045e-05, 'timestamp': '2025-09-15 03:17:33.201926', 'step': 1534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:33.234017', 'step': 1534, 'epoch': 3} {'type': 'loss', 'content': 0.00028563832165673375, 'timestamp': '2025-09-15 03:17:33.237925', 'step': 1535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:33.268366', 'step': 1535, 'epoch': 3} {'type': 'loss', 'content': 0.0002187574136769399, 'timestamp': '2025-09-15 03:17:33.293694', 'step': 1536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:33.324069', 'step': 1536, 'epoch': 3} {'type': 'loss', 'content': 0.00011100248229922727, 'timestamp': '2025-09-15 03:17:33.326344', 'step': 1537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:33.358555', 'step': 1537, 'epoch': 3} {'type': 'loss', 'content': 8.833393076201901e-05, 'timestamp': '2025-09-15 03:17:33.361330', 'step': 1538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:33.391782', 'step': 1538, 'epoch': 3} {'type': 'loss', 'content': 0.0005231335526332259, 'timestamp': '2025-09-15 03:17:33.394097', 'step': 1539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:33.424023', 'step': 1539, 'epoch': 3} {'type': 'loss', 'content': 8.783872181084007e-05, 'timestamp': '2025-09-15 03:17:33.447675', 'step': 1540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:33.478658', 'step': 1540, 'epoch': 3} {'type': 'loss', 'content': 4.987287684343755e-05, 'timestamp': '2025-09-15 03:17:33.482716', 'step': 1541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:33.512866', 'step': 1541, 'epoch': 3} {'type': 'loss', 'content': 0.0007524410611949861, 'timestamp': '2025-09-15 03:17:33.514978', 'step': 1542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:33.545420', 'step': 1542, 'epoch': 3} {'type': 'loss', 'content': 0.00011115191591670737, 'timestamp': '2025-09-15 03:17:33.547457', 'step': 1543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:33.577893', 'step': 1543, 'epoch': 3} {'type': 'loss', 'content': 9.302215039497241e-05, 'timestamp': '2025-09-15 03:17:33.602913', 'step': 1544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:33.634182', 'step': 1544, 'epoch': 3} {'type': 'loss', 'content': 4.7555680794175714e-05, 'timestamp': '2025-09-15 03:17:33.636238', 'step': 1545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:33.666896', 'step': 1545, 'epoch': 3} {'type': 'loss', 'content': 0.0001348310906905681, 'timestamp': '2025-09-15 03:17:33.673475', 'step': 1546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:33.704241', 'step': 1546, 'epoch': 3} {'type': 'loss', 'content': 0.00013676950766239315, 'timestamp': '2025-09-15 03:17:33.708405', 'step': 1547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:33.739109', 'step': 1547, 'epoch': 3} {'type': 'loss', 'content': 0.001560694188810885, 'timestamp': '2025-09-15 03:17:33.763827', 'step': 1548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:33.794886', 'step': 1548, 'epoch': 3} {'type': 'loss', 'content': 0.003178370650857687, 'timestamp': '2025-09-15 03:17:33.796961', 'step': 1549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:33.827609', 'step': 1549, 'epoch': 3} {'type': 'loss', 'content': 9.050655353348702e-05, 'timestamp': '2025-09-15 03:17:33.831261', 'step': 1550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:33.863106', 'step': 1550, 'epoch': 3} {'type': 'loss', 'content': 8.560741844121367e-05, 'timestamp': '2025-09-15 03:17:33.865271', 'step': 1551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:33.896231', 'step': 1551, 'epoch': 3} {'type': 'loss', 'content': 3.5383240174269304e-05, 'timestamp': '2025-09-15 03:17:33.923540', 'step': 1552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:33.954491', 'step': 1552, 'epoch': 3} {'type': 'loss', 'content': 0.00016244736616499722, 'timestamp': '2025-09-15 03:17:33.958507', 'step': 1553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:33.989745', 'step': 1553, 'epoch': 3} {'type': 'loss', 'content': 6.825554737588391e-05, 'timestamp': '2025-09-15 03:17:33.996237', 'step': 1554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:34.027115', 'step': 1554, 'epoch': 3} {'type': 'loss', 'content': 6.661610677838326e-05, 'timestamp': '2025-09-15 03:17:34.030857', 'step': 1555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:34.061911', 'step': 1555, 'epoch': 3} {'type': 'loss', 'content': 0.0005459814565256238, 'timestamp': '2025-09-15 03:17:34.089428', 'step': 1556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:34.120076', 'step': 1556, 'epoch': 3} {'type': 'loss', 'content': 9.487460920354351e-05, 'timestamp': '2025-09-15 03:17:34.122130', 'step': 1557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:34.152569', 'step': 1557, 'epoch': 3} {'type': 'loss', 'content': 0.0005464908317662776, 'timestamp': '2025-09-15 03:17:34.154995', 'step': 1558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:34.190420', 'step': 1558, 'epoch': 3} {'type': 'loss', 'content': 0.0005193923716433346, 'timestamp': '2025-09-15 03:17:34.193712', 'step': 1559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:34.230142', 'step': 1559, 'epoch': 3} {'type': 'loss', 'content': 0.00015628755500074476, 'timestamp': '2025-09-15 03:17:34.253828', 'step': 1560, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:34.844433', 'step': 1560, 'epoch': 3} {'type': 'pplx', 'content': 127046711.74826264, 'timestamp': '2025-09-15 03:17:34.846766', 'step': 1560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:34.875719', 'step': 1560, 'epoch': 3} {'type': 'loss', 'content': 0.00029895914485678077, 'timestamp': '2025-09-15 03:17:34.877717', 'step': 1561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:34.908207', 'step': 1561, 'epoch': 3} {'type': 'loss', 'content': 0.000372684356989339, 'timestamp': '2025-09-15 03:17:34.912332', 'step': 1562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:34.942547', 'step': 1562, 'epoch': 3} {'type': 'loss', 'content': 0.00044768728548660874, 'timestamp': '2025-09-15 03:17:34.944641', 'step': 1563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:34.975107', 'step': 1563, 'epoch': 3} {'type': 'loss', 'content': 0.0016328077763319016, 'timestamp': '2025-09-15 03:17:34.998711', 'step': 1564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:35.028816', 'step': 1564, 'epoch': 3} {'type': 'loss', 'content': 0.06172548979520798, 'timestamp': '2025-09-15 03:17:35.031046', 'step': 1565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:35.061231', 'step': 1565, 'epoch': 3} {'type': 'loss', 'content': 8.873850310919806e-05, 'timestamp': '2025-09-15 03:17:35.063414', 'step': 1566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:35.093886', 'step': 1566, 'epoch': 3} {'type': 'loss', 'content': 0.0033889301121234894, 'timestamp': '2025-09-15 03:17:35.097818', 'step': 1567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:35.130488', 'step': 1567, 'epoch': 3} {'type': 'loss', 'content': 0.00027142881299369037, 'timestamp': '2025-09-15 03:17:35.158736', 'step': 1568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:35.189673', 'step': 1568, 'epoch': 3} {'type': 'loss', 'content': 0.0022615992929786444, 'timestamp': '2025-09-15 03:17:35.191772', 'step': 1569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:35.222456', 'step': 1569, 'epoch': 3} {'type': 'loss', 'content': 0.000650630914606154, 'timestamp': '2025-09-15 03:17:35.226391', 'step': 1570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:35.257331', 'step': 1570, 'epoch': 3} {'type': 'loss', 'content': 5.338179835234769e-05, 'timestamp': '2025-09-15 03:17:35.261510', 'step': 1571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:35.292670', 'step': 1571, 'epoch': 3} {'type': 'loss', 'content': 3.176413520122878e-05, 'timestamp': '2025-09-15 03:17:35.317451', 'step': 1572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:35.347856', 'step': 1572, 'epoch': 3} {'type': 'loss', 'content': 3.908006692654453e-05, 'timestamp': '2025-09-15 03:17:35.350192', 'step': 1573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:35.380146', 'step': 1573, 'epoch': 3} {'type': 'loss', 'content': 3.330052277306095e-05, 'timestamp': '2025-09-15 03:17:35.382355', 'step': 1574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:35.412864', 'step': 1574, 'epoch': 3} {'type': 'loss', 'content': 4.1809329559328035e-05, 'timestamp': '2025-09-15 03:17:35.420352', 'step': 1575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:35.450831', 'step': 1575, 'epoch': 3} {'type': 'loss', 'content': 0.00010141608800040558, 'timestamp': '2025-09-15 03:17:35.474429', 'step': 1576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:35.504856', 'step': 1576, 'epoch': 3} {'type': 'loss', 'content': 0.0002838522777892649, 'timestamp': '2025-09-15 03:17:35.506933', 'step': 1577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:35.537208', 'step': 1577, 'epoch': 3} {'type': 'loss', 'content': 0.00032633551745675504, 'timestamp': '2025-09-15 03:17:35.539265', 'step': 1578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:35.570627', 'step': 1578, 'epoch': 3} {'type': 'loss', 'content': 6.84321130393073e-05, 'timestamp': '2025-09-15 03:17:35.577585', 'step': 1579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:35.608101', 'step': 1579, 'epoch': 3} {'type': 'loss', 'content': 0.0007345884805545211, 'timestamp': '2025-09-15 03:17:35.631677', 'step': 1580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:35.662201', 'step': 1580, 'epoch': 3} {'type': 'loss', 'content': 0.0004944135434925556, 'timestamp': '2025-09-15 03:17:35.664209', 'step': 1581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:35.694173', 'step': 1581, 'epoch': 3} {'type': 'loss', 'content': 6.49203357170336e-05, 'timestamp': '2025-09-15 03:17:35.696214', 'step': 1582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:35.726693', 'step': 1582, 'epoch': 3} {'type': 'loss', 'content': 3.8760634197387844e-05, 'timestamp': '2025-09-15 03:17:35.730028', 'step': 1583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:35.760249', 'step': 1583, 'epoch': 3} {'type': 'loss', 'content': 0.00014414360339287668, 'timestamp': '2025-09-15 03:17:35.783847', 'step': 1584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:35.814239', 'step': 1584, 'epoch': 3} {'type': 'loss', 'content': 0.00013764832692686468, 'timestamp': '2025-09-15 03:17:35.816187', 'step': 1585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:35.846022', 'step': 1585, 'epoch': 3} {'type': 'loss', 'content': 6.031978045939468e-05, 'timestamp': '2025-09-15 03:17:35.848592', 'step': 1586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:35.880434', 'step': 1586, 'epoch': 3} {'type': 'loss', 'content': 5.813813186250627e-05, 'timestamp': '2025-09-15 03:17:35.887020', 'step': 1587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:35.916803', 'step': 1587, 'epoch': 3} {'type': 'loss', 'content': 8.35294122225605e-05, 'timestamp': '2025-09-15 03:17:35.940522', 'step': 1588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:35.971881', 'step': 1588, 'epoch': 3} {'type': 'loss', 'content': 0.0017162241274490952, 'timestamp': '2025-09-15 03:17:35.973874', 'step': 1589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:36.004162', 'step': 1589, 'epoch': 3} {'type': 'loss', 'content': 0.016213053837418556, 'timestamp': '2025-09-15 03:17:36.006677', 'step': 1590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:36.038240', 'step': 1590, 'epoch': 3} {'type': 'loss', 'content': 5.837536446051672e-05, 'timestamp': '2025-09-15 03:17:36.044716', 'step': 1591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:36.074986', 'step': 1591, 'epoch': 3} {'type': 'loss', 'content': 0.0015365114668384194, 'timestamp': '2025-09-15 03:17:36.099835', 'step': 1592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:36.130061', 'step': 1592, 'epoch': 3} {'type': 'loss', 'content': 0.0001333472173428163, 'timestamp': '2025-09-15 03:17:36.133005', 'step': 1593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:36.165253', 'step': 1593, 'epoch': 3} {'type': 'loss', 'content': 2.8431244572857395e-05, 'timestamp': '2025-09-15 03:17:36.169276', 'step': 1594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:36.199455', 'step': 1594, 'epoch': 3} {'type': 'loss', 'content': 4.7248369810404256e-05, 'timestamp': '2025-09-15 03:17:36.201498', 'step': 1595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:36.232692', 'step': 1595, 'epoch': 3} {'type': 'loss', 'content': 7.492154691135511e-05, 'timestamp': '2025-09-15 03:17:36.260500', 'step': 1596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:36.290677', 'step': 1596, 'epoch': 3} {'type': 'loss', 'content': 6.914538244018331e-05, 'timestamp': '2025-09-15 03:17:36.292803', 'step': 1597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:36.323646', 'step': 1597, 'epoch': 3} {'type': 'loss', 'content': 0.0001433978322893381, 'timestamp': '2025-09-15 03:17:36.330323', 'step': 1598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:36.360701', 'step': 1598, 'epoch': 3} {'type': 'loss', 'content': 0.0006194100715219975, 'timestamp': '2025-09-15 03:17:36.362701', 'step': 1599, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:36.957874', 'step': 1599, 'epoch': 3} {'type': 'pplx', 'content': 122925884.74202192, 'timestamp': '2025-09-15 03:17:36.959624', 'step': 1599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:36.988130', 'step': 1599, 'epoch': 3} {'type': 'loss', 'content': 4.66671226604376e-05, 'timestamp': '2025-09-15 03:17:37.011830', 'step': 1600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:37.043243', 'step': 1600, 'epoch': 3} {'type': 'loss', 'content': 0.00011773057485697791, 'timestamp': '2025-09-15 03:17:37.046501', 'step': 1601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:37.084855', 'step': 1601, 'epoch': 3} {'type': 'loss', 'content': 0.001495333737693727, 'timestamp': '2025-09-15 03:17:37.094864', 'step': 1602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:37.126743', 'step': 1602, 'epoch': 3} {'type': 'loss', 'content': 0.00011041124525945634, 'timestamp': '2025-09-15 03:17:37.129126', 'step': 1603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:37.159428', 'step': 1603, 'epoch': 3} {'type': 'loss', 'content': 5.4903135605854914e-05, 'timestamp': '2025-09-15 03:17:37.183095', 'step': 1604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:37.213306', 'step': 1604, 'epoch': 3} {'type': 'loss', 'content': 9.56984149524942e-05, 'timestamp': '2025-09-15 03:17:37.216111', 'step': 1605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:37.247474', 'step': 1605, 'epoch': 3} {'type': 'loss', 'content': 2.949278859887272e-05, 'timestamp': '2025-09-15 03:17:37.254999', 'step': 1606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:37.288946', 'step': 1606, 'epoch': 3} {'type': 'loss', 'content': 0.00021853976068086922, 'timestamp': '2025-09-15 03:17:37.291447', 'step': 1607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:37.330739', 'step': 1607, 'epoch': 3} {'type': 'loss', 'content': 0.00010619118984322995, 'timestamp': '2025-09-15 03:17:37.361134', 'step': 1608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:37.392648', 'step': 1608, 'epoch': 3} {'type': 'loss', 'content': 0.0009482751484028995, 'timestamp': '2025-09-15 03:17:37.398185', 'step': 1609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:37.438540', 'step': 1609, 'epoch': 3} {'type': 'loss', 'content': 0.01618199609220028, 'timestamp': '2025-09-15 03:17:37.440563', 'step': 1610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:37.471787', 'step': 1610, 'epoch': 3} {'type': 'loss', 'content': 3.588308027246967e-05, 'timestamp': '2025-09-15 03:17:37.474084', 'step': 1611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:37.506528', 'step': 1611, 'epoch': 3} {'type': 'loss', 'content': 4.4485444959718734e-05, 'timestamp': '2025-09-15 03:17:37.529941', 'step': 1612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:37.560595', 'step': 1612, 'epoch': 3} {'type': 'loss', 'content': 0.00016524593229405582, 'timestamp': '2025-09-15 03:17:37.562649', 'step': 1613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:37.592853', 'step': 1613, 'epoch': 3} {'type': 'loss', 'content': 0.00020557189418468624, 'timestamp': '2025-09-15 03:17:37.597162', 'step': 1614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:37.627512', 'step': 1614, 'epoch': 3} {'type': 'loss', 'content': 0.002822331851348281, 'timestamp': '2025-09-15 03:17:37.629930', 'step': 1615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:37.660540', 'step': 1615, 'epoch': 3} {'type': 'loss', 'content': 0.00014614405517932028, 'timestamp': '2025-09-15 03:17:37.685416', 'step': 1616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:37.715651', 'step': 1616, 'epoch': 3} {'type': 'loss', 'content': 0.0031574785243719816, 'timestamp': '2025-09-15 03:17:37.718353', 'step': 1617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:37.748758', 'step': 1617, 'epoch': 3} {'type': 'loss', 'content': 3.237018609070219e-05, 'timestamp': '2025-09-15 03:17:37.750827', 'step': 1618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:37.780957', 'step': 1618, 'epoch': 3} {'type': 'loss', 'content': 0.00010914222366409376, 'timestamp': '2025-09-15 03:17:37.782704', 'step': 1619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:37.812760', 'step': 1619, 'epoch': 3} {'type': 'loss', 'content': 2.9762662961729802e-05, 'timestamp': '2025-09-15 03:17:37.836095', 'step': 1620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:37.865983', 'step': 1620, 'epoch': 3} {'type': 'loss', 'content': 0.0014514749636873603, 'timestamp': '2025-09-15 03:17:37.868050', 'step': 1621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:37.898253', 'step': 1621, 'epoch': 3} {'type': 'loss', 'content': 0.004247348755598068, 'timestamp': '2025-09-15 03:17:37.904837', 'step': 1622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:37.936106', 'step': 1622, 'epoch': 3} {'type': 'loss', 'content': 0.00014622985327150673, 'timestamp': '2025-09-15 03:17:37.940361', 'step': 1623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:37.972645', 'step': 1623, 'epoch': 3} {'type': 'loss', 'content': 0.0014752390561625361, 'timestamp': '2025-09-15 03:17:38.000748', 'step': 1624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:38.031819', 'step': 1624, 'epoch': 3} {'type': 'loss', 'content': 0.0016322402516379952, 'timestamp': '2025-09-15 03:17:38.033832', 'step': 1625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:38.064419', 'step': 1625, 'epoch': 3} {'type': 'loss', 'content': 0.0025306600145995617, 'timestamp': '2025-09-15 03:17:38.066466', 'step': 1626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:38.098091', 'step': 1626, 'epoch': 3} {'type': 'loss', 'content': 0.00017592271615285426, 'timestamp': '2025-09-15 03:17:38.100110', 'step': 1627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:38.130791', 'step': 1627, 'epoch': 3} {'type': 'loss', 'content': 0.00015544805501122028, 'timestamp': '2025-09-15 03:17:38.158302', 'step': 1628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:38.190106', 'step': 1628, 'epoch': 3} {'type': 'loss', 'content': 0.00015891435032244772, 'timestamp': '2025-09-15 03:17:38.194633', 'step': 1629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:38.225390', 'step': 1629, 'epoch': 3} {'type': 'loss', 'content': 0.00015892648661974818, 'timestamp': '2025-09-15 03:17:38.232185', 'step': 1630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:38.262894', 'step': 1630, 'epoch': 3} {'type': 'loss', 'content': 0.00010154654592042789, 'timestamp': '2025-09-15 03:17:38.264856', 'step': 1631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:38.295288', 'step': 1631, 'epoch': 3} {'type': 'loss', 'content': 5.215726196183823e-05, 'timestamp': '2025-09-15 03:17:38.323422', 'step': 1632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:38.353612', 'step': 1632, 'epoch': 3} {'type': 'loss', 'content': 9.126593067776412e-05, 'timestamp': '2025-09-15 03:17:38.355599', 'step': 1633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:38.385887', 'step': 1633, 'epoch': 3} {'type': 'loss', 'content': 0.0001367527002003044, 'timestamp': '2025-09-15 03:17:38.388311', 'step': 1634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:38.418447', 'step': 1634, 'epoch': 3} {'type': 'loss', 'content': 7.14808120392263e-05, 'timestamp': '2025-09-15 03:17:38.420579', 'step': 1635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:38.451590', 'step': 1635, 'epoch': 3} {'type': 'loss', 'content': 9.053764370037243e-05, 'timestamp': '2025-09-15 03:17:38.478913', 'step': 1636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:38.508769', 'step': 1636, 'epoch': 3} {'type': 'loss', 'content': 0.000221226378926076, 'timestamp': '2025-09-15 03:17:38.510717', 'step': 1637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:38.541352', 'step': 1637, 'epoch': 3} {'type': 'loss', 'content': 0.00017531040066387504, 'timestamp': '2025-09-15 03:17:38.543386', 'step': 1638, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:39.127497', 'step': 1638, 'epoch': 3} {'type': 'pplx', 'content': 129207242.84349388, 'timestamp': '2025-09-15 03:17:39.129391', 'step': 1638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:39.158372', 'step': 1638, 'epoch': 3} {'type': 'loss', 'content': 2.8826221750932746e-05, 'timestamp': '2025-09-15 03:17:39.164900', 'step': 1639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:39.195486', 'step': 1639, 'epoch': 3} {'type': 'loss', 'content': 0.00022367548081092536, 'timestamp': '2025-09-15 03:17:39.219110', 'step': 1640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:39.250174', 'step': 1640, 'epoch': 3} {'type': 'loss', 'content': 0.0002002513501793146, 'timestamp': '2025-09-15 03:17:39.252287', 'step': 1641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:39.283472', 'step': 1641, 'epoch': 3} {'type': 'loss', 'content': 5.098511974210851e-05, 'timestamp': '2025-09-15 03:17:39.293213', 'step': 1642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:39.323618', 'step': 1642, 'epoch': 3} {'type': 'loss', 'content': 0.0005090952618047595, 'timestamp': '2025-09-15 03:17:39.325691', 'step': 1643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:39.356005', 'step': 1643, 'epoch': 3} {'type': 'loss', 'content': 0.00013635992945637554, 'timestamp': '2025-09-15 03:17:39.381004', 'step': 1644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:39.412639', 'step': 1644, 'epoch': 3} {'type': 'loss', 'content': 0.0062268441542983055, 'timestamp': '2025-09-15 03:17:39.417494', 'step': 1645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:39.448202', 'step': 1645, 'epoch': 3} {'type': 'loss', 'content': 0.00019006979709956795, 'timestamp': '2025-09-15 03:17:39.450407', 'step': 1646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:39.480972', 'step': 1646, 'epoch': 3} {'type': 'loss', 'content': 5.067802339908667e-05, 'timestamp': '2025-09-15 03:17:39.482899', 'step': 1647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:39.514222', 'step': 1647, 'epoch': 3} {'type': 'loss', 'content': 0.00024163494526874274, 'timestamp': '2025-09-15 03:17:39.542616', 'step': 1648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:39.575048', 'step': 1648, 'epoch': 3} {'type': 'loss', 'content': 0.00035120677785016596, 'timestamp': '2025-09-15 03:17:39.579842', 'step': 1649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:39.611782', 'step': 1649, 'epoch': 3} {'type': 'loss', 'content': 5.8132307458436117e-05, 'timestamp': '2025-09-15 03:17:39.613926', 'step': 1650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:39.645586', 'step': 1650, 'epoch': 3} {'type': 'loss', 'content': 0.0001509405265096575, 'timestamp': '2025-09-15 03:17:39.655709', 'step': 1651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:39.686160', 'step': 1651, 'epoch': 3} {'type': 'loss', 'content': 3.116812513326295e-05, 'timestamp': '2025-09-15 03:17:39.711410', 'step': 1652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:39.741900', 'step': 1652, 'epoch': 3} {'type': 'loss', 'content': 9.823811706155539e-05, 'timestamp': '2025-09-15 03:17:39.747089', 'step': 1653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:39.777467', 'step': 1653, 'epoch': 3} {'type': 'loss', 'content': 2.2268699467531405e-05, 'timestamp': '2025-09-15 03:17:39.779608', 'step': 1654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:39.810971', 'step': 1654, 'epoch': 3} {'type': 'loss', 'content': 0.0004127257561776787, 'timestamp': '2025-09-15 03:17:39.813080', 'step': 1655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:39.843089', 'step': 1655, 'epoch': 3} {'type': 'loss', 'content': 0.00010350901720812544, 'timestamp': '2025-09-15 03:17:39.866359', 'step': 1656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:39.895946', 'step': 1656, 'epoch': 3} {'type': 'loss', 'content': 0.00023793868604116142, 'timestamp': '2025-09-15 03:17:39.897910', 'step': 1657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:39.928427', 'step': 1657, 'epoch': 3} {'type': 'loss', 'content': 3.0240338674047962e-05, 'timestamp': '2025-09-15 03:17:39.930544', 'step': 1658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:39.960872', 'step': 1658, 'epoch': 3} {'type': 'loss', 'content': 0.000498669920489192, 'timestamp': '2025-09-15 03:17:39.963099', 'step': 1659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:39.993800', 'step': 1659, 'epoch': 3} {'type': 'loss', 'content': 3.756927981157787e-05, 'timestamp': '2025-09-15 03:17:40.019025', 'step': 1660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:40.049851', 'step': 1660, 'epoch': 3} {'type': 'loss', 'content': 0.0003786278539337218, 'timestamp': '2025-09-15 03:17:40.054126', 'step': 1661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:40.084162', 'step': 1661, 'epoch': 3} {'type': 'loss', 'content': 0.0014668498188257217, 'timestamp': '2025-09-15 03:17:40.086820', 'step': 1662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:40.117725', 'step': 1662, 'epoch': 3} {'type': 'loss', 'content': 3.2887041015783325e-05, 'timestamp': '2025-09-15 03:17:40.124820', 'step': 1663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:40.155642', 'step': 1663, 'epoch': 3} {'type': 'loss', 'content': 0.0002688818785827607, 'timestamp': '2025-09-15 03:17:40.180454', 'step': 1664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 336], 'flops': 9966940982208}, 'timestamp': '2025-09-15 03:17:40.211731', 'step': 1664, 'epoch': 3} {'type': 'loss', 'content': 0.00021670998830813915, 'timestamp': '2025-09-15 03:17:40.224444', 'step': 1665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:40.254860', 'step': 1665, 'epoch': 3} {'type': 'loss', 'content': 0.03699951618909836, 'timestamp': '2025-09-15 03:17:40.256896', 'step': 1666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:40.287111', 'step': 1666, 'epoch': 3} {'type': 'loss', 'content': 4.46015692432411e-05, 'timestamp': '2025-09-15 03:17:40.289244', 'step': 1667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:40.318654', 'step': 1667, 'epoch': 3} {'type': 'loss', 'content': 4.677437027567066e-05, 'timestamp': '2025-09-15 03:17:40.341998', 'step': 1668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:40.373612', 'step': 1668, 'epoch': 3} {'type': 'loss', 'content': 2.693878013815265e-05, 'timestamp': '2025-09-15 03:17:40.378392', 'step': 1669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:40.408649', 'step': 1669, 'epoch': 3} {'type': 'loss', 'content': 9.786576265469193e-05, 'timestamp': '2025-09-15 03:17:40.410841', 'step': 1670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:40.441928', 'step': 1670, 'epoch': 3} {'type': 'loss', 'content': 0.00011372385051799938, 'timestamp': '2025-09-15 03:17:40.444060', 'step': 1671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:40.476153', 'step': 1671, 'epoch': 3} {'type': 'loss', 'content': 4.8357556806877255e-05, 'timestamp': '2025-09-15 03:17:40.501045', 'step': 1672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:40.537066', 'step': 1672, 'epoch': 3} {'type': 'loss', 'content': 0.025152156129479408, 'timestamp': '2025-09-15 03:17:40.539372', 'step': 1673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:40.570462', 'step': 1673, 'epoch': 3} {'type': 'loss', 'content': 0.0001292003144044429, 'timestamp': '2025-09-15 03:17:40.572601', 'step': 1674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:40.602754', 'step': 1674, 'epoch': 3} {'type': 'loss', 'content': 7.244075823109597e-05, 'timestamp': '2025-09-15 03:17:40.605069', 'step': 1675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:40.635816', 'step': 1675, 'epoch': 3} {'type': 'loss', 'content': 6.385731103364378e-05, 'timestamp': '2025-09-15 03:17:40.659405', 'step': 1676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:40.690011', 'step': 1676, 'epoch': 3} {'type': 'loss', 'content': 4.8743659135652706e-05, 'timestamp': '2025-09-15 03:17:40.694849', 'step': 1677, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:41.281619', 'step': 1677, 'epoch': 3} {'type': 'pplx', 'content': 135761003.76394957, 'timestamp': '2025-09-15 03:17:41.286975', 'step': 1677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:41.320621', 'step': 1677, 'epoch': 3} {'type': 'loss', 'content': 4.129093940719031e-05, 'timestamp': '2025-09-15 03:17:41.323529', 'step': 1678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:41.354603', 'step': 1678, 'epoch': 3} {'type': 'loss', 'content': 9.938734729075804e-05, 'timestamp': '2025-09-15 03:17:41.361197', 'step': 1679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:41.391695', 'step': 1679, 'epoch': 3} {'type': 'loss', 'content': 0.0008803194505162537, 'timestamp': '2025-09-15 03:17:41.416373', 'step': 1680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:41.451837', 'step': 1680, 'epoch': 3} {'type': 'loss', 'content': 4.711677320301533e-05, 'timestamp': '2025-09-15 03:17:41.458294', 'step': 1681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:41.492780', 'step': 1681, 'epoch': 3} {'type': 'loss', 'content': 0.0001084799223463051, 'timestamp': '2025-09-15 03:17:41.500333', 'step': 1682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:41.531971', 'step': 1682, 'epoch': 3} {'type': 'loss', 'content': 4.469270788831636e-05, 'timestamp': '2025-09-15 03:17:41.534657', 'step': 1683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:41.565164', 'step': 1683, 'epoch': 3} {'type': 'loss', 'content': 0.00013767420023214072, 'timestamp': '2025-09-15 03:17:41.593367', 'step': 1684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:41.623730', 'step': 1684, 'epoch': 3} {'type': 'loss', 'content': 8.43398374854587e-05, 'timestamp': '2025-09-15 03:17:41.625900', 'step': 1685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:41.656635', 'step': 1685, 'epoch': 3} {'type': 'loss', 'content': 4.093054303666577e-05, 'timestamp': '2025-09-15 03:17:41.659183', 'step': 1686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:41.689677', 'step': 1686, 'epoch': 3} {'type': 'loss', 'content': 6.223317177500576e-05, 'timestamp': '2025-09-15 03:17:41.696270', 'step': 1687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:41.726827', 'step': 1687, 'epoch': 3} {'type': 'loss', 'content': 7.752107921987772e-05, 'timestamp': '2025-09-15 03:17:41.750215', 'step': 1688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:41.781636', 'step': 1688, 'epoch': 3} {'type': 'loss', 'content': 0.0001287296909140423, 'timestamp': '2025-09-15 03:17:41.786424', 'step': 1689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:41.816332', 'step': 1689, 'epoch': 3} {'type': 'loss', 'content': 6.89863009029068e-05, 'timestamp': '2025-09-15 03:17:41.818560', 'step': 1690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:41.848871', 'step': 1690, 'epoch': 3} {'type': 'loss', 'content': 9.750080789672211e-05, 'timestamp': '2025-09-15 03:17:41.851272', 'step': 1691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:41.881635', 'step': 1691, 'epoch': 3} {'type': 'loss', 'content': 3.922905307263136e-05, 'timestamp': '2025-09-15 03:17:41.905383', 'step': 1692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:41.936710', 'step': 1692, 'epoch': 3} {'type': 'loss', 'content': 5.144997703609988e-05, 'timestamp': '2025-09-15 03:17:41.938888', 'step': 1693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:41.969565', 'step': 1693, 'epoch': 3} {'type': 'loss', 'content': 2.403518919891212e-05, 'timestamp': '2025-09-15 03:17:41.971654', 'step': 1694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:42.001595', 'step': 1694, 'epoch': 3} {'type': 'loss', 'content': 0.0002912590862251818, 'timestamp': '2025-09-15 03:17:42.003765', 'step': 1695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:42.034418', 'step': 1695, 'epoch': 3} {'type': 'loss', 'content': 0.00025712628848850727, 'timestamp': '2025-09-15 03:17:42.059188', 'step': 1696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:42.089426', 'step': 1696, 'epoch': 3} {'type': 'loss', 'content': 7.03453624737449e-05, 'timestamp': '2025-09-15 03:17:42.091097', 'step': 1697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:42.121163', 'step': 1697, 'epoch': 3} {'type': 'loss', 'content': 6.731459870934486e-05, 'timestamp': '2025-09-15 03:17:42.123087', 'step': 1698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:42.153182', 'step': 1698, 'epoch': 3} {'type': 'loss', 'content': 0.0017748010577633977, 'timestamp': '2025-09-15 03:17:42.157329', 'step': 1699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:42.188002', 'step': 1699, 'epoch': 3} {'type': 'loss', 'content': 6.168592517497018e-05, 'timestamp': '2025-09-15 03:17:42.211696', 'step': 1700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:42.241704', 'step': 1700, 'epoch': 3} {'type': 'loss', 'content': 0.0001511227892478928, 'timestamp': '2025-09-15 03:17:42.243621', 'step': 1701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:42.273960', 'step': 1701, 'epoch': 3} {'type': 'loss', 'content': 0.0003747472946997732, 'timestamp': '2025-09-15 03:17:42.280554', 'step': 1702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:42.311650', 'step': 1702, 'epoch': 3} {'type': 'loss', 'content': 0.00010352622484788299, 'timestamp': '2025-09-15 03:17:42.321099', 'step': 1703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:42.353220', 'step': 1703, 'epoch': 3} {'type': 'loss', 'content': 0.0008733622962608933, 'timestamp': '2025-09-15 03:17:42.380767', 'step': 1704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:42.411800', 'step': 1704, 'epoch': 3} {'type': 'loss', 'content': 0.00016946492542047054, 'timestamp': '2025-09-15 03:17:42.415286', 'step': 1705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:42.445197', 'step': 1705, 'epoch': 3} {'type': 'loss', 'content': 0.00017234814004041255, 'timestamp': '2025-09-15 03:17:42.447671', 'step': 1706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:42.479684', 'step': 1706, 'epoch': 3} {'type': 'loss', 'content': 6.766520527889952e-05, 'timestamp': '2025-09-15 03:17:42.481953', 'step': 1707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:42.513059', 'step': 1707, 'epoch': 3} {'type': 'loss', 'content': 0.000172713422216475, 'timestamp': '2025-09-15 03:17:42.538190', 'step': 1708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:42.568272', 'step': 1708, 'epoch': 3} {'type': 'loss', 'content': 0.0005255985306575894, 'timestamp': '2025-09-15 03:17:42.570532', 'step': 1709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:42.601059', 'step': 1709, 'epoch': 3} {'type': 'loss', 'content': 5.218220758251846e-05, 'timestamp': '2025-09-15 03:17:42.603193', 'step': 1710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:42.633773', 'step': 1710, 'epoch': 3} {'type': 'loss', 'content': 0.00021290150471031666, 'timestamp': '2025-09-15 03:17:42.637784', 'step': 1711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:42.668196', 'step': 1711, 'epoch': 3} {'type': 'loss', 'content': 0.0010456795571371913, 'timestamp': '2025-09-15 03:17:42.692181', 'step': 1712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:42.722741', 'step': 1712, 'epoch': 3} {'type': 'loss', 'content': 7.15435016900301e-05, 'timestamp': '2025-09-15 03:17:42.725108', 'step': 1713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:42.757247', 'step': 1713, 'epoch': 3} {'type': 'loss', 'content': 0.0016514122253283858, 'timestamp': '2025-09-15 03:17:42.760580', 'step': 1714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:42.791853', 'step': 1714, 'epoch': 3} {'type': 'loss', 'content': 6.494522676803172e-05, 'timestamp': '2025-09-15 03:17:42.794005', 'step': 1715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:42.825168', 'step': 1715, 'epoch': 3} {'type': 'loss', 'content': 5.324947051121853e-05, 'timestamp': '2025-09-15 03:17:42.848715', 'step': 1716, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:43.438548', 'step': 1716, 'epoch': 3} {'type': 'pplx', 'content': 138965352.20121047, 'timestamp': '2025-09-15 03:17:43.440687', 'step': 1716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:43.469520', 'step': 1716, 'epoch': 3} {'type': 'loss', 'content': 6.845069583505392e-05, 'timestamp': '2025-09-15 03:17:43.471651', 'step': 1717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:43.502230', 'step': 1717, 'epoch': 3} {'type': 'loss', 'content': 3.28231108142063e-05, 'timestamp': '2025-09-15 03:17:43.506566', 'step': 1718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:43.537483', 'step': 1718, 'epoch': 3} {'type': 'loss', 'content': 0.0023003576789051294, 'timestamp': '2025-09-15 03:17:43.544153', 'step': 1719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:43.574979', 'step': 1719, 'epoch': 3} {'type': 'loss', 'content': 0.00016819333541207016, 'timestamp': '2025-09-15 03:17:43.600063', 'step': 1720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:43.636281', 'step': 1720, 'epoch': 3} {'type': 'loss', 'content': 7.036477472865954e-05, 'timestamp': '2025-09-15 03:17:43.638425', 'step': 1721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:43.670225', 'step': 1721, 'epoch': 3} {'type': 'loss', 'content': 7.370777893811464e-05, 'timestamp': '2025-09-15 03:17:43.676826', 'step': 1722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:43.707432', 'step': 1722, 'epoch': 3} {'type': 'loss', 'content': 0.00018600482144393027, 'timestamp': '2025-09-15 03:17:43.709773', 'step': 1723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:43.740517', 'step': 1723, 'epoch': 3} {'type': 'loss', 'content': 0.000139830241096206, 'timestamp': '2025-09-15 03:17:43.764287', 'step': 1724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:43.795428', 'step': 1724, 'epoch': 3} {'type': 'loss', 'content': 0.029730452224612236, 'timestamp': '2025-09-15 03:17:43.797441', 'step': 1725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:43.827379', 'step': 1725, 'epoch': 3} {'type': 'loss', 'content': 0.00035227558691985905, 'timestamp': '2025-09-15 03:17:43.829398', 'step': 1726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:43.859975', 'step': 1726, 'epoch': 3} {'type': 'loss', 'content': 0.00037362094735726714, 'timestamp': '2025-09-15 03:17:43.862117', 'step': 1727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:43.894698', 'step': 1727, 'epoch': 3} {'type': 'loss', 'content': 8.00935085862875e-05, 'timestamp': '2025-09-15 03:17:43.919587', 'step': 1728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:43.952324', 'step': 1728, 'epoch': 3} {'type': 'loss', 'content': 0.0001457637408748269, 'timestamp': '2025-09-15 03:17:43.954398', 'step': 1729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:43.985154', 'step': 1729, 'epoch': 3} {'type': 'loss', 'content': 0.00011651766544673592, 'timestamp': '2025-09-15 03:17:43.987336', 'step': 1730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 272], 'flops': 8068526078144}, 'timestamp': '2025-09-15 03:17:44.017947', 'step': 1730, 'epoch': 3} {'type': 'loss', 'content': 0.00023613232770003378, 'timestamp': '2025-09-15 03:17:44.027936', 'step': 1731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:44.058871', 'step': 1731, 'epoch': 3} {'type': 'loss', 'content': 4.856818122789264e-05, 'timestamp': '2025-09-15 03:17:44.082395', 'step': 1732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:44.113761', 'step': 1732, 'epoch': 3} {'type': 'loss', 'content': 3.342150739626959e-05, 'timestamp': '2025-09-15 03:17:44.115756', 'step': 1733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.146674', 'step': 1733, 'epoch': 3} {'type': 'loss', 'content': 0.0007277569966390729, 'timestamp': '2025-09-15 03:17:44.148838', 'step': 1734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:44.179897', 'step': 1734, 'epoch': 3} {'type': 'loss', 'content': 2.9094177079969086e-05, 'timestamp': '2025-09-15 03:17:44.181940', 'step': 1735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:44.213149', 'step': 1735, 'epoch': 3} {'type': 'loss', 'content': 0.00013098781346343458, 'timestamp': '2025-09-15 03:17:44.237940', 'step': 1736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.268487', 'step': 1736, 'epoch': 3} {'type': 'loss', 'content': 0.0003191656433045864, 'timestamp': '2025-09-15 03:17:44.271011', 'step': 1737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.301703', 'step': 1737, 'epoch': 3} {'type': 'loss', 'content': 0.00011755731975426897, 'timestamp': '2025-09-15 03:17:44.304014', 'step': 1738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:44.338186', 'step': 1738, 'epoch': 3} {'type': 'loss', 'content': 6.14327727816999e-05, 'timestamp': '2025-09-15 03:17:44.345775', 'step': 1739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:44.377737', 'step': 1739, 'epoch': 3} {'type': 'loss', 'content': 8.636755228508264e-05, 'timestamp': '2025-09-15 03:17:44.402384', 'step': 1740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:44.433147', 'step': 1740, 'epoch': 3} {'type': 'loss', 'content': 0.000406697450671345, 'timestamp': '2025-09-15 03:17:44.435279', 'step': 1741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:44.465395', 'step': 1741, 'epoch': 3} {'type': 'loss', 'content': 8.220264862757176e-05, 'timestamp': '2025-09-15 03:17:44.469820', 'step': 1742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:44.500540', 'step': 1742, 'epoch': 3} {'type': 'loss', 'content': 0.0001976119092432782, 'timestamp': '2025-09-15 03:17:44.503207', 'step': 1743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:44.533651', 'step': 1743, 'epoch': 3} {'type': 'loss', 'content': 0.00012376924860291183, 'timestamp': '2025-09-15 03:17:44.562249', 'step': 1744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.591645', 'step': 1744, 'epoch': 3} {'type': 'loss', 'content': 0.00025345777976326644, 'timestamp': '2025-09-15 03:17:44.593899', 'step': 1745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:44.624985', 'step': 1745, 'epoch': 3} {'type': 'loss', 'content': 0.014335216954350471, 'timestamp': '2025-09-15 03:17:44.631391', 'step': 1746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.662438', 'step': 1746, 'epoch': 3} {'type': 'loss', 'content': 0.006056909915059805, 'timestamp': '2025-09-15 03:17:44.664566', 'step': 1747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:44.695551', 'step': 1747, 'epoch': 3} {'type': 'loss', 'content': 0.00012910777877550572, 'timestamp': '2025-09-15 03:17:44.719509', 'step': 1748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 304], 'flops': 9017733530176}, 'timestamp': '2025-09-15 03:17:44.750316', 'step': 1748, 'epoch': 3} {'type': 'loss', 'content': 0.00021627625392284244, 'timestamp': '2025-09-15 03:17:44.759954', 'step': 1749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:44.791299', 'step': 1749, 'epoch': 3} {'type': 'loss', 'content': 4.3783224100479856e-05, 'timestamp': '2025-09-15 03:17:44.795272', 'step': 1750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:44.826847', 'step': 1750, 'epoch': 3} {'type': 'loss', 'content': 0.0003299799282103777, 'timestamp': '2025-09-15 03:17:44.830789', 'step': 1751, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:44.862992', 'step': 1751, 'epoch': 3} {'type': 'loss', 'content': 0.00012049104407196864, 'timestamp': '2025-09-15 03:17:44.888120', 'step': 1752, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:44.919159', 'step': 1752, 'epoch': 3} {'type': 'loss', 'content': 0.00011262983025517315, 'timestamp': '2025-09-15 03:17:44.924025', 'step': 1753, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.955482', 'step': 1753, 'epoch': 3} {'type': 'loss', 'content': 9.659709030529484e-05, 'timestamp': '2025-09-15 03:17:44.957823', 'step': 1754, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:44.990112', 'step': 1754, 'epoch': 3} {'type': 'loss', 'content': 0.0011848767753690481, 'timestamp': '2025-09-15 03:17:44.994887', 'step': 1755, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:45.595571', 'step': 1755, 'epoch': 3} {'type': 'pplx', 'content': 140086331.41677055, 'timestamp': '2025-09-15 03:17:45.597889', 'step': 1755, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:45.626885', 'step': 1755, 'epoch': 3} {'type': 'loss', 'content': 5.509558832272887e-05, 'timestamp': '2025-09-15 03:17:45.654311', 'step': 1756, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:45.687158', 'step': 1756, 'epoch': 3} {'type': 'loss', 'content': 0.0002119831769960001, 'timestamp': '2025-09-15 03:17:45.689524', 'step': 1757, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:45.720028', 'step': 1757, 'epoch': 3} {'type': 'loss', 'content': 0.0004407106607686728, 'timestamp': '2025-09-15 03:17:45.724456', 'step': 1758, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:45.754924', 'step': 1758, 'epoch': 3} {'type': 'loss', 'content': 0.00029242291930131614, 'timestamp': '2025-09-15 03:17:45.762539', 'step': 1759, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:45.793873', 'step': 1759, 'epoch': 3} {'type': 'loss', 'content': 0.00021790176106151193, 'timestamp': '2025-09-15 03:17:45.818803', 'step': 1760, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:45.849546', 'step': 1760, 'epoch': 3} {'type': 'loss', 'content': 0.00012010778300464153, 'timestamp': '2025-09-15 03:17:45.853836', 'step': 1761, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 80], 'flops': 2373281365952}, 'timestamp': '2025-09-15 03:17:45.884420', 'step': 1761, 'epoch': 3} {'type': 'loss', 'content': 0.00024840538389980793, 'timestamp': '2025-09-15 03:17:45.886840', 'step': 1762, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:45.917588', 'step': 1762, 'epoch': 3} {'type': 'loss', 'content': 0.00018551461107563227, 'timestamp': '2025-09-15 03:17:45.919864', 'step': 1763, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:45.950975', 'step': 1763, 'epoch': 3} {'type': 'loss', 'content': 0.0006247073761187494, 'timestamp': '2025-09-15 03:17:45.978512', 'step': 1764, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:46.009176', 'step': 1764, 'epoch': 3} {'type': 'loss', 'content': 9.605469676898792e-05, 'timestamp': '2025-09-15 03:17:46.011359', 'step': 1765, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:46.041856', 'step': 1765, 'epoch': 3} {'type': 'loss', 'content': 0.00016489412519149482, 'timestamp': '2025-09-15 03:17:46.044002', 'step': 1766, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:46.075531', 'step': 1766, 'epoch': 3} {'type': 'loss', 'content': 8.419169898843393e-05, 'timestamp': '2025-09-15 03:17:46.077708', 'step': 1767, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:46.108142', 'step': 1767, 'epoch': 3} {'type': 'loss', 'content': 0.0018693411257117987, 'timestamp': '2025-09-15 03:17:46.131586', 'step': 1768, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:46.167000', 'step': 1768, 'epoch': 3} {'type': 'loss', 'content': 0.0003774032520595938, 'timestamp': '2025-09-15 03:17:46.169305', 'step': 1769, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:46.200748', 'step': 1769, 'epoch': 3} {'type': 'loss', 'content': 0.02417862042784691, 'timestamp': '2025-09-15 03:17:46.207098', 'step': 1770, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:46.237856', 'step': 1770, 'epoch': 3} {'type': 'loss', 'content': 5.752419019700028e-05, 'timestamp': '2025-09-15 03:17:46.240370', 'step': 1771, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:46.271758', 'step': 1771, 'epoch': 3} {'type': 'loss', 'content': 9.621244680602103e-05, 'timestamp': '2025-09-15 03:17:46.296919', 'step': 1772, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:46.328942', 'step': 1772, 'epoch': 3} {'type': 'loss', 'content': 6.722491525579244e-05, 'timestamp': '2025-09-15 03:17:46.333281', 'step': 1773, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:46.364030', 'step': 1773, 'epoch': 3} {'type': 'loss', 'content': 0.00020308203238528222, 'timestamp': '2025-09-15 03:17:46.370538', 'step': 1774, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:46.401083', 'step': 1774, 'epoch': 3} {'type': 'loss', 'content': 0.000121831770229619, 'timestamp': '2025-09-15 03:17:46.403224', 'step': 1775, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:46.433251', 'step': 1775, 'epoch': 3} {'type': 'loss', 'content': 3.221361475880258e-05, 'timestamp': '2025-09-15 03:17:46.456832', 'step': 1776, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:46.487454', 'step': 1776, 'epoch': 3} {'type': 'loss', 'content': 8.978391997516155e-05, 'timestamp': '2025-09-15 03:17:46.489916', 'step': 1777, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:46.520135', 'step': 1777, 'epoch': 3} {'type': 'loss', 'content': 3.9384856791002676e-05, 'timestamp': '2025-09-15 03:17:46.522192', 'step': 1778, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:46.552442', 'step': 1778, 'epoch': 3} {'type': 'loss', 'content': 0.0006986562511883676, 'timestamp': '2025-09-15 03:17:46.554741', 'step': 1779, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:46.585535', 'step': 1779, 'epoch': 3} {'type': 'loss', 'content': 3.0208628231775947e-05, 'timestamp': '2025-09-15 03:17:46.610407', 'step': 1780, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:46.640734', 'step': 1780, 'epoch': 3} {'type': 'loss', 'content': 0.00015801134577486664, 'timestamp': '2025-09-15 03:17:46.642717', 'step': 1781, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:46.673852', 'step': 1781, 'epoch': 3} {'type': 'loss', 'content': 7.350670784944668e-05, 'timestamp': '2025-09-15 03:17:46.677442', 'step': 1782, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:46.708425', 'step': 1782, 'epoch': 3} {'type': 'loss', 'content': 1.752012576616835e-05, 'timestamp': '2025-09-15 03:17:46.710685', 'step': 1783, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:46.740955', 'step': 1783, 'epoch': 3} {'type': 'loss', 'content': 0.00017398015188518912, 'timestamp': '2025-09-15 03:17:46.765667', 'step': 1784, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:46.796491', 'step': 1784, 'epoch': 3} {'type': 'loss', 'content': 0.00010306698823114857, 'timestamp': '2025-09-15 03:17:46.800677', 'step': 1785, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:46.831725', 'step': 1785, 'epoch': 3} {'type': 'loss', 'content': 6.714624032611027e-05, 'timestamp': '2025-09-15 03:17:46.835119', 'step': 1786, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:46.866345', 'step': 1786, 'epoch': 3} {'type': 'loss', 'content': 0.0005150840734131634, 'timestamp': '2025-09-15 03:17:46.872547', 'step': 1787, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:46.905436', 'step': 1787, 'epoch': 3} {'type': 'loss', 'content': 0.0004283357411623001, 'timestamp': '2025-09-15 03:17:46.929174', 'step': 1788, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:46.960322', 'step': 1788, 'epoch': 3} {'type': 'loss', 'content': 0.0003343620337545872, 'timestamp': '2025-09-15 03:17:46.962422', 'step': 1789, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:46.999647', 'step': 1789, 'epoch': 3} {'type': 'loss', 'content': 0.000761338509619236, 'timestamp': '2025-09-15 03:17:47.001884', 'step': 1790, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:47.037223', 'step': 1790, 'epoch': 3} {'type': 'loss', 'content': 8.442411490250379e-05, 'timestamp': '2025-09-15 03:17:47.039198', 'step': 1791, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:47.069365', 'step': 1791, 'epoch': 3} {'type': 'loss', 'content': 0.0005162957822903991, 'timestamp': '2025-09-15 03:17:47.094420', 'step': 1792, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:47.126246', 'step': 1792, 'epoch': 3} {'type': 'loss', 'content': 0.000750789069570601, 'timestamp': '2025-09-15 03:17:47.130381', 'step': 1793, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:47.161590', 'step': 1793, 'epoch': 3} {'type': 'loss', 'content': 0.0051539321430027485, 'timestamp': '2025-09-15 03:17:47.163674', 'step': 1794, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:47.747823', 'step': 1794, 'epoch': 3} {'type': 'pplx', 'content': 139876290.7306363, 'timestamp': '2025-09-15 03:17:47.750267', 'step': 1794, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:47.780000', 'step': 1794, 'epoch': 3} {'type': 'loss', 'content': 5.731660712626763e-05, 'timestamp': '2025-09-15 03:17:47.783328', 'step': 1795, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:47.814510', 'step': 1795, 'epoch': 3} {'type': 'loss', 'content': 6.324555579340085e-05, 'timestamp': '2025-09-15 03:17:47.838194', 'step': 1796, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:47.868953', 'step': 1796, 'epoch': 3} {'type': 'loss', 'content': 0.00020690243400167674, 'timestamp': '2025-09-15 03:17:47.870933', 'step': 1797, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:47.901019', 'step': 1797, 'epoch': 3} {'type': 'loss', 'content': 8.057226659730077e-05, 'timestamp': '2025-09-15 03:17:47.907824', 'step': 1798, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:47.937934', 'step': 1798, 'epoch': 3} {'type': 'loss', 'content': 5.081558629171923e-05, 'timestamp': '2025-09-15 03:17:47.940073', 'step': 1799, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:47.970420', 'step': 1799, 'epoch': 3} {'type': 'loss', 'content': 0.000641024496871978, 'timestamp': '2025-09-15 03:17:47.995209', 'step': 1800, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:48.025733', 'step': 1800, 'epoch': 3} {'type': 'loss', 'content': 7.569055742351338e-05, 'timestamp': '2025-09-15 03:17:48.027799', 'step': 1801, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:48.058529', 'step': 1801, 'epoch': 3} {'type': 'loss', 'content': 3.237251439713873e-05, 'timestamp': '2025-09-15 03:17:48.062471', 'step': 1802, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:48.092147', 'step': 1802, 'epoch': 3} {'type': 'loss', 'content': 0.0009622335783205926, 'timestamp': '2025-09-15 03:17:48.096558', 'step': 1803, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:48.127561', 'step': 1803, 'epoch': 3} {'type': 'loss', 'content': 0.00022668966266792268, 'timestamp': '2025-09-15 03:17:48.152553', 'step': 1804, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:48.183191', 'step': 1804, 'epoch': 3} {'type': 'loss', 'content': 0.0006432764348573983, 'timestamp': '2025-09-15 03:17:48.185065', 'step': 1805, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:48.217108', 'step': 1805, 'epoch': 3} {'type': 'loss', 'content': 0.0002396961353952065, 'timestamp': '2025-09-15 03:17:48.220989', 'step': 1806, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:48.252007', 'step': 1806, 'epoch': 3} {'type': 'loss', 'content': 2.7812382541014813e-05, 'timestamp': '2025-09-15 03:17:48.258314', 'step': 1807, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:48.288829', 'step': 1807, 'epoch': 3} {'type': 'loss', 'content': 0.028440991416573524, 'timestamp': '2025-09-15 03:17:48.316365', 'step': 1808, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:48.347006', 'step': 1808, 'epoch': 3} {'type': 'loss', 'content': 5.9055400924989954e-05, 'timestamp': '2025-09-15 03:17:48.349409', 'step': 1809, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:48.379531', 'step': 1809, 'epoch': 3} {'type': 'loss', 'content': 6.185309030115604e-05, 'timestamp': '2025-09-15 03:17:48.383699', 'step': 1810, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:48.414706', 'step': 1810, 'epoch': 3} {'type': 'loss', 'content': 0.00011187683412572369, 'timestamp': '2025-09-15 03:17:48.421412', 'step': 1811, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:48.452106', 'step': 1811, 'epoch': 3} {'type': 'loss', 'content': 0.002612639917060733, 'timestamp': '2025-09-15 03:17:48.475812', 'step': 1812, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:48.506139', 'step': 1812, 'epoch': 3} {'type': 'loss', 'content': 0.00084578653331846, 'timestamp': '2025-09-15 03:17:48.508136', 'step': 1813, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:48.538928', 'step': 1813, 'epoch': 3} {'type': 'loss', 'content': 0.00016805656196083874, 'timestamp': '2025-09-15 03:17:48.545362', 'step': 1814, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:48.576927', 'step': 1814, 'epoch': 3} {'type': 'loss', 'content': 0.0002695379371289164, 'timestamp': '2025-09-15 03:17:48.579339', 'step': 1815, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:48.609711', 'step': 1815, 'epoch': 3} {'type': 'loss', 'content': 0.0001330919039901346, 'timestamp': '2025-09-15 03:17:48.633494', 'step': 1816, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:48.664532', 'step': 1816, 'epoch': 3} {'type': 'loss', 'content': 3.694388215080835e-05, 'timestamp': '2025-09-15 03:17:48.668735', 'step': 1817, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:48.699747', 'step': 1817, 'epoch': 3} {'type': 'loss', 'content': 8.705775690032169e-05, 'timestamp': '2025-09-15 03:17:48.703552', 'step': 1818, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:48.734292', 'step': 1818, 'epoch': 3} {'type': 'loss', 'content': 3.831345748039894e-05, 'timestamp': '2025-09-15 03:17:48.736500', 'step': 1819, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:48.768337', 'step': 1819, 'epoch': 3} {'type': 'loss', 'content': 6.43322491669096e-05, 'timestamp': '2025-09-15 03:17:48.795922', 'step': 1820, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:48.827213', 'step': 1820, 'epoch': 3} {'type': 'loss', 'content': 0.00010557322093518451, 'timestamp': '2025-09-15 03:17:48.831882', 'step': 1821, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:48.862576', 'step': 1821, 'epoch': 3} {'type': 'loss', 'content': 3.788010144489817e-05, 'timestamp': '2025-09-15 03:17:48.865094', 'step': 1822, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:48.895413', 'step': 1822, 'epoch': 3} {'type': 'loss', 'content': 6.223905802471563e-05, 'timestamp': '2025-09-15 03:17:48.898615', 'step': 1823, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:48.929419', 'step': 1823, 'epoch': 3} {'type': 'loss', 'content': 0.0003868688945658505, 'timestamp': '2025-09-15 03:17:48.954107', 'step': 1824, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:48.985096', 'step': 1824, 'epoch': 3} {'type': 'loss', 'content': 0.00013783354370389134, 'timestamp': '2025-09-15 03:17:48.989397', 'step': 1825, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:49.020157', 'step': 1825, 'epoch': 3} {'type': 'loss', 'content': 2.5953302611014806e-05, 'timestamp': '2025-09-15 03:17:49.023940', 'step': 1826, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:49.054521', 'step': 1826, 'epoch': 3} {'type': 'loss', 'content': 3.0394159693969414e-05, 'timestamp': '2025-09-15 03:17:49.058746', 'step': 1827, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 240], 'flops': 7119318626112}, 'timestamp': '2025-09-15 03:17:49.089242', 'step': 1827, 'epoch': 3} {'type': 'loss', 'content': 7.785171328578144e-05, 'timestamp': '2025-09-15 03:17:49.117503', 'step': 1828, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:49.148405', 'step': 1828, 'epoch': 3} {'type': 'loss', 'content': 0.0018183441134169698, 'timestamp': '2025-09-15 03:17:49.150989', 'step': 1829, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:49.182348', 'step': 1829, 'epoch': 3} {'type': 'loss', 'content': 6.903747998876497e-05, 'timestamp': '2025-09-15 03:17:49.186236', 'step': 1830, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:49.216811', 'step': 1830, 'epoch': 3} {'type': 'loss', 'content': 0.00015899473510216922, 'timestamp': '2025-09-15 03:17:49.218876', 'step': 1831, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:49.249546', 'step': 1831, 'epoch': 3} {'type': 'loss', 'content': 0.001722590415738523, 'timestamp': '2025-09-15 03:17:49.273439', 'step': 1832, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:49.303242', 'step': 1832, 'epoch': 3} {'type': 'loss', 'content': 8.198097202694044e-05, 'timestamp': '2025-09-15 03:17:49.305692', 'step': 1833, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:49.889493', 'step': 1833, 'epoch': 3} {'type': 'pplx', 'content': 145811986.31162205, 'timestamp': '2025-09-15 03:17:49.891391', 'step': 1833, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:49.920674', 'step': 1833, 'epoch': 3} {'type': 'loss', 'content': 3.944264608435333e-05, 'timestamp': '2025-09-15 03:17:49.923196', 'step': 1834, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:49.953753', 'step': 1834, 'epoch': 3} {'type': 'loss', 'content': 0.0001038339250953868, 'timestamp': '2025-09-15 03:17:49.957586', 'step': 1835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:49.987788', 'step': 1835, 'epoch': 3} {'type': 'loss', 'content': 3.5049022699240595e-05, 'timestamp': '2025-09-15 03:17:50.011463', 'step': 1836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:50.042853', 'step': 1836, 'epoch': 3} {'type': 'loss', 'content': 4.700681893154979e-05, 'timestamp': '2025-09-15 03:17:50.045000', 'step': 1837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:50.076048', 'step': 1837, 'epoch': 3} {'type': 'loss', 'content': 3.058017682633363e-05, 'timestamp': '2025-09-15 03:17:50.078544', 'step': 1838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:50.109159', 'step': 1838, 'epoch': 3} {'type': 'loss', 'content': 2.1085737898829393e-05, 'timestamp': '2025-09-15 03:17:50.112888', 'step': 1839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:50.143781', 'step': 1839, 'epoch': 3} {'type': 'loss', 'content': 9.610828419681638e-05, 'timestamp': '2025-09-15 03:17:50.167469', 'step': 1840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:50.197108', 'step': 1840, 'epoch': 3} {'type': 'loss', 'content': 0.004703332204371691, 'timestamp': '2025-09-15 03:17:50.199418', 'step': 1841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:50.229821', 'step': 1841, 'epoch': 3} {'type': 'loss', 'content': 0.0007846274529583752, 'timestamp': '2025-09-15 03:17:50.232116', 'step': 1842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:50.262965', 'step': 1842, 'epoch': 3} {'type': 'loss', 'content': 9.324204438598827e-05, 'timestamp': '2025-09-15 03:17:50.264999', 'step': 1843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:50.294921', 'step': 1843, 'epoch': 3} {'type': 'loss', 'content': 2.0824878447456285e-05, 'timestamp': '2025-09-15 03:17:50.318448', 'step': 1844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:50.351452', 'step': 1844, 'epoch': 3} {'type': 'loss', 'content': 2.5175850169034675e-05, 'timestamp': '2025-09-15 03:17:50.353872', 'step': 1845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 256], 'flops': 7593922352128}, 'timestamp': '2025-09-15 03:17:50.385865', 'step': 1845, 'epoch': 3} {'type': 'loss', 'content': 2.5445378923905082e-05, 'timestamp': '2025-09-15 03:17:50.393352', 'step': 1846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:50.424171', 'step': 1846, 'epoch': 3} {'type': 'loss', 'content': 0.00018293499306309968, 'timestamp': '2025-09-15 03:17:50.426359', 'step': 1847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:50.457724', 'step': 1847, 'epoch': 3} {'type': 'loss', 'content': 0.0002315417950740084, 'timestamp': '2025-09-15 03:17:50.485210', 'step': 1848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:50.515903', 'step': 1848, 'epoch': 3} {'type': 'loss', 'content': 2.1575300706899725e-05, 'timestamp': '2025-09-15 03:17:50.518880', 'step': 1849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:50.549547', 'step': 1849, 'epoch': 3} {'type': 'loss', 'content': 7.31429536244832e-05, 'timestamp': '2025-09-15 03:17:50.553778', 'step': 1850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:50.584547', 'step': 1850, 'epoch': 3} {'type': 'loss', 'content': 0.00030331689049489796, 'timestamp': '2025-09-15 03:17:50.586652', 'step': 1851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:50.617072', 'step': 1851, 'epoch': 3} {'type': 'loss', 'content': 0.0001617541565792635, 'timestamp': '2025-09-15 03:17:50.641767', 'step': 1852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:17:50.671890', 'step': 1852, 'epoch': 3} {'type': 'loss', 'content': 0.007455338723957539, 'timestamp': '2025-09-15 03:17:50.673950', 'step': 1853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:50.704358', 'step': 1853, 'epoch': 3} {'type': 'loss', 'content': 0.00015413833898492157, 'timestamp': '2025-09-15 03:17:50.706491', 'step': 1854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:50.737855', 'step': 1854, 'epoch': 3} {'type': 'loss', 'content': 8.115199307212606e-05, 'timestamp': '2025-09-15 03:17:50.742300', 'step': 1855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:17:50.772640', 'step': 1855, 'epoch': 3} {'type': 'loss', 'content': 7.526958506787196e-05, 'timestamp': '2025-09-15 03:17:50.796207', 'step': 1856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:50.828150', 'step': 1856, 'epoch': 3} {'type': 'loss', 'content': 0.00012373061326798052, 'timestamp': '2025-09-15 03:17:50.833114', 'step': 1857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:50.865702', 'step': 1857, 'epoch': 3} {'type': 'loss', 'content': 9.806371963350102e-05, 'timestamp': '2025-09-15 03:17:50.872254', 'step': 1858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:50.904947', 'step': 1858, 'epoch': 3} {'type': 'loss', 'content': 2.61258701357292e-05, 'timestamp': '2025-09-15 03:17:50.911649', 'step': 1859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 96], 'flops': 2847885091968}, 'timestamp': '2025-09-15 03:17:50.942527', 'step': 1859, 'epoch': 3} {'type': 'loss', 'content': 8.361357322428375e-05, 'timestamp': '2025-09-15 03:17:50.966278', 'step': 1860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:50.997586', 'step': 1860, 'epoch': 3} {'type': 'loss', 'content': 0.00019799031724687666, 'timestamp': '2025-09-15 03:17:51.002567', 'step': 1861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:17:51.035498', 'step': 1861, 'epoch': 3} {'type': 'loss', 'content': 0.00023388156841974705, 'timestamp': '2025-09-15 03:17:51.040216', 'step': 1862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 224], 'flops': 6644714900096}, 'timestamp': '2025-09-15 03:17:51.073505', 'step': 1862, 'epoch': 3} {'type': 'loss', 'content': 0.00021596389706246555, 'timestamp': '2025-09-15 03:17:51.080234', 'step': 1863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:17:51.110559', 'step': 1863, 'epoch': 3} {'type': 'loss', 'content': 0.044805027544498444, 'timestamp': '2025-09-15 03:17:51.134105', 'step': 1864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 192], 'flops': 5695507448064}, 'timestamp': '2025-09-15 03:17:51.164511', 'step': 1864, 'epoch': 3} {'type': 'loss', 'content': 0.05255137011408806, 'timestamp': '2025-09-15 03:17:51.166617', 'step': 1865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:51.197226', 'step': 1865, 'epoch': 3} {'type': 'loss', 'content': 6.220581417437643e-05, 'timestamp': '2025-09-15 03:17:51.201180', 'step': 1866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:17:51.231987', 'step': 1866, 'epoch': 3} {'type': 'loss', 'content': 0.0002913509670179337, 'timestamp': '2025-09-15 03:17:51.235894', 'step': 1867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 208], 'flops': 6170111174080}, 'timestamp': '2025-09-15 03:17:51.266330', 'step': 1867, 'epoch': 3} {'type': 'loss', 'content': 0.00011638918658718467, 'timestamp': '2025-09-15 03:17:51.293716', 'step': 1868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [2, 192], 'flops': 2847885110400}, 'timestamp': '2025-09-15 03:17:51.324458', 'step': 1868, 'epoch': 3} {'type': 'loss', 'content': 3.343486241647042e-05, 'timestamp': '2025-09-15 03:17:51.327696', 'step': 1869, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 224], 'batch_size': 8, 'flops': 4429610391296}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 64], 'batch_size': 8, 'flops': 1265603017216}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 160], 'batch_size': 8, 'flops': 3164007441664}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [8, 208], 'batch_size': 8, 'flops': 4113209653888}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 176], 'batch_size': 8, 'flops': 3480408179072}, {'type': 'perplexity', 'in_batch_dim': [8, 256], 'batch_size': 8, 'flops': 5062411866112}, {'type': 'perplexity', 'in_batch_dim': [8, 128], 'batch_size': 8, 'flops': 2531205966848}, {'type': 'perplexity', 'in_batch_dim': [8, 144], 'batch_size': 8, 'flops': 2847606704256}, {'type': 'perplexity', 'in_batch_dim': [8, 192], 'batch_size': 8, 'flops': 3796808916480}, {'type': 'perplexity', 'in_batch_dim': [5, 144], 'batch_size': 8, 'flops': 2847606704256}], 'timestamp': '2025-09-15 03:17:51.914734', 'step': 1869, 'epoch': 3} {'type': 'pplx', 'content': 146250701.10244825, 'timestamp': '2025-09-15 03:17:51.916520', 'step': 1869, 'epoch': 3} {'type': 'best_pplx', 'content': 52920853.06323647, 'timestamp': '2025-09-15 03:17:51.917998', 'step': 1869, 'epoch': 3} {'type': 'best_step', 'content': 39, 'timestamp': '2025-09-15 03:17:51.919338', 'step': 1869, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 5333250945655808, 'timestamp': '2025-09-15 03:17:51.920707', 'step': 1869, 'epoch': 3} {'type': 'total_train_flops', 'content': 9219668431260864, 'timestamp': '2025-09-15 03:17:51.922367', 'step': 1869, 'epoch': 3}