{'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:18:56.278210', 'step': 0, 'epoch': 0} {'type': 'pplx', 'content': 226674977.87649825, 'timestamp': '2025-09-15 03:18:56.280551', 'step': 0, 'epoch': 0} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:56.348646', 'step': 0, 'epoch': 1} {'type': 'loss', 'content': 0.6993563771247864, 'timestamp': '2025-09-15 03:18:56.351544', 'step': 1, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.406233', 'step': 1, 'epoch': 1} {'type': 'loss', 'content': 0.6954641342163086, 'timestamp': '2025-09-15 03:18:56.408321', 'step': 2, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.438403', 'step': 2, 'epoch': 1} {'type': 'loss', 'content': 0.7254253029823303, 'timestamp': '2025-09-15 03:18:56.440509', 'step': 3, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.469959', 'step': 3, 'epoch': 1} {'type': 'loss', 'content': 0.7056644558906555, 'timestamp': '2025-09-15 03:18:56.542440', 'step': 4, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.574764', 'step': 4, 'epoch': 1} {'type': 'loss', 'content': 0.11781269311904907, 'timestamp': '2025-09-15 03:18:56.576792', 'step': 5, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.607459', 'step': 5, 'epoch': 1} {'type': 'loss', 'content': 0.12128707021474838, 'timestamp': '2025-09-15 03:18:56.609387', 'step': 6, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:56.638998', 'step': 6, 'epoch': 1} {'type': 'loss', 'content': 0.12236928939819336, 'timestamp': '2025-09-15 03:18:56.641222', 'step': 7, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.671055', 'step': 7, 'epoch': 1} {'type': 'loss', 'content': 0.13365435600280762, 'timestamp': '2025-09-15 03:18:56.694617', 'step': 8, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.725056', 'step': 8, 'epoch': 1} {'type': 'loss', 'content': 0.006437270902097225, 'timestamp': '2025-09-15 03:18:56.727266', 'step': 9, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.756541', 'step': 9, 'epoch': 1} {'type': 'loss', 'content': 0.04929186776280403, 'timestamp': '2025-09-15 03:18:56.758751', 'step': 10, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.788831', 'step': 10, 'epoch': 1} {'type': 'loss', 'content': 0.024710940197110176, 'timestamp': '2025-09-15 03:18:56.790973', 'step': 11, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.820971', 'step': 11, 'epoch': 1} {'type': 'loss', 'content': 0.007417821791023016, 'timestamp': '2025-09-15 03:18:56.844554', 'step': 12, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.875236', 'step': 12, 'epoch': 1} {'type': 'loss', 'content': 0.021830081939697266, 'timestamp': '2025-09-15 03:18:56.877306', 'step': 13, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.906637', 'step': 13, 'epoch': 1} {'type': 'loss', 'content': 0.035914890468120575, 'timestamp': '2025-09-15 03:18:56.908599', 'step': 14, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.938129', 'step': 14, 'epoch': 1} {'type': 'loss', 'content': 0.02858211100101471, 'timestamp': '2025-09-15 03:18:56.940300', 'step': 15, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:56.970164', 'step': 15, 'epoch': 1} {'type': 'loss', 'content': 0.021775051951408386, 'timestamp': '2025-09-15 03:18:56.993512', 'step': 16, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.023220', 'step': 16, 'epoch': 1} {'type': 'loss', 'content': 0.052091244608163834, 'timestamp': '2025-09-15 03:18:57.025256', 'step': 17, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.056868', 'step': 17, 'epoch': 1} {'type': 'loss', 'content': 0.04736480861902237, 'timestamp': '2025-09-15 03:18:57.059994', 'step': 18, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:57.089801', 'step': 18, 'epoch': 1} {'type': 'loss', 'content': 0.03423725813627243, 'timestamp': '2025-09-15 03:18:57.092033', 'step': 19, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.121772', 'step': 19, 'epoch': 1} {'type': 'loss', 'content': 0.0331081748008728, 'timestamp': '2025-09-15 03:18:57.145160', 'step': 20, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:57.176424', 'step': 20, 'epoch': 1} {'type': 'loss', 'content': 0.03887788578867912, 'timestamp': '2025-09-15 03:18:57.178482', 'step': 21, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.208450', 'step': 21, 'epoch': 1} {'type': 'loss', 'content': 0.042407404631376266, 'timestamp': '2025-09-15 03:18:57.210398', 'step': 22, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.239886', 'step': 22, 'epoch': 1} {'type': 'loss', 'content': 0.02546323463320732, 'timestamp': '2025-09-15 03:18:57.242010', 'step': 23, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.271801', 'step': 23, 'epoch': 1} {'type': 'loss', 'content': 0.02281245030462742, 'timestamp': '2025-09-15 03:18:57.295299', 'step': 24, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.325592', 'step': 24, 'epoch': 1} {'type': 'loss', 'content': 0.024779021739959717, 'timestamp': '2025-09-15 03:18:57.327527', 'step': 25, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:57.357749', 'step': 25, 'epoch': 1} {'type': 'loss', 'content': 0.04004126042127609, 'timestamp': '2025-09-15 03:18:57.360029', 'step': 26, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:57.390858', 'step': 26, 'epoch': 1} {'type': 'loss', 'content': 0.022673314437270164, 'timestamp': '2025-09-15 03:18:57.392930', 'step': 27, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.423507', 'step': 27, 'epoch': 1} {'type': 'loss', 'content': 0.026302272453904152, 'timestamp': '2025-09-15 03:18:57.447028', 'step': 28, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.477613', 'step': 28, 'epoch': 1} {'type': 'loss', 'content': 0.03431862220168114, 'timestamp': '2025-09-15 03:18:57.479784', 'step': 29, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.509218', 'step': 29, 'epoch': 1} {'type': 'loss', 'content': 0.02185298502445221, 'timestamp': '2025-09-15 03:18:57.511438', 'step': 30, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.541511', 'step': 30, 'epoch': 1} {'type': 'loss', 'content': 0.022062508389353752, 'timestamp': '2025-09-15 03:18:57.543578', 'step': 31, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.573772', 'step': 31, 'epoch': 1} {'type': 'loss', 'content': 0.021942496299743652, 'timestamp': '2025-09-15 03:18:57.597392', 'step': 32, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.627701', 'step': 32, 'epoch': 1} {'type': 'loss', 'content': 0.01732688769698143, 'timestamp': '2025-09-15 03:18:57.629783', 'step': 33, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.659704', 'step': 33, 'epoch': 1} {'type': 'loss', 'content': 0.031228866428136826, 'timestamp': '2025-09-15 03:18:57.661750', 'step': 34, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.691609', 'step': 34, 'epoch': 1} {'type': 'loss', 'content': 0.01614983379840851, 'timestamp': '2025-09-15 03:18:57.693798', 'step': 35, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.723941', 'step': 35, 'epoch': 1} {'type': 'loss', 'content': 0.019677946344017982, 'timestamp': '2025-09-15 03:18:57.747440', 'step': 36, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.777010', 'step': 36, 'epoch': 1} {'type': 'loss', 'content': 0.021435057744383812, 'timestamp': '2025-09-15 03:18:57.779236', 'step': 37, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.809337', 'step': 37, 'epoch': 1} {'type': 'loss', 'content': 0.020535405725240707, 'timestamp': '2025-09-15 03:18:57.811466', 'step': 38, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:57.841257', 'step': 38, 'epoch': 1} {'type': 'loss', 'content': 0.016418244689702988, 'timestamp': '2025-09-15 03:18:57.843311', 'step': 39, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.872971', 'step': 39, 'epoch': 1} {'type': 'loss', 'content': 0.017667364329099655, 'timestamp': '2025-09-15 03:18:57.896459', 'step': 40, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.926342', 'step': 40, 'epoch': 1} {'type': 'loss', 'content': 0.011035345494747162, 'timestamp': '2025-09-15 03:18:57.928424', 'step': 41, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:57.958599', 'step': 41, 'epoch': 1} {'type': 'loss', 'content': 0.017797647044062614, 'timestamp': '2025-09-15 03:18:57.960698', 'step': 42, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:57.990571', 'step': 42, 'epoch': 1} {'type': 'loss', 'content': 0.031567107886075974, 'timestamp': '2025-09-15 03:18:57.992465', 'step': 43, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:18:58.038881', 'step': 43, 'epoch': 1} {'type': 'loss', 'content': 0.011192579753696918, 'timestamp': '2025-09-15 03:18:58.062238', 'step': 44, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.091817', 'step': 44, 'epoch': 1} {'type': 'loss', 'content': 0.007767003960907459, 'timestamp': '2025-09-15 03:18:58.093999', 'step': 45, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.123652', 'step': 45, 'epoch': 1} {'type': 'loss', 'content': 0.0076360017992556095, 'timestamp': '2025-09-15 03:18:58.125689', 'step': 46, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.155072', 'step': 46, 'epoch': 1} {'type': 'loss', 'content': 0.0363575778901577, 'timestamp': '2025-09-15 03:18:58.157084', 'step': 47, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.186806', 'step': 47, 'epoch': 1} {'type': 'loss', 'content': 0.0191658865660429, 'timestamp': '2025-09-15 03:18:58.210638', 'step': 48, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.240693', 'step': 48, 'epoch': 1} {'type': 'loss', 'content': 0.004426487721502781, 'timestamp': '2025-09-15 03:18:58.242702', 'step': 49, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:58.272343', 'step': 49, 'epoch': 1} {'type': 'loss', 'content': 0.03847786784172058, 'timestamp': '2025-09-15 03:18:58.275434', 'step': 50, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.305274', 'step': 50, 'epoch': 1} {'type': 'loss', 'content': 0.00494822021573782, 'timestamp': '2025-09-15 03:18:58.307646', 'step': 51, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.337515', 'step': 51, 'epoch': 1} {'type': 'loss', 'content': 0.034384578466415405, 'timestamp': '2025-09-15 03:18:58.361238', 'step': 52, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.391506', 'step': 52, 'epoch': 1} {'type': 'loss', 'content': 0.03528139740228653, 'timestamp': '2025-09-15 03:18:58.393553', 'step': 53, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:18:58.423308', 'step': 53, 'epoch': 1} {'type': 'loss', 'content': 0.0411178395152092, 'timestamp': '2025-09-15 03:18:58.425449', 'step': 54, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:58.456132', 'step': 54, 'epoch': 1} {'type': 'loss', 'content': 0.02395920641720295, 'timestamp': '2025-09-15 03:18:58.458396', 'step': 55, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:58.488216', 'step': 55, 'epoch': 1} {'type': 'loss', 'content': 0.04303309693932533, 'timestamp': '2025-09-15 03:18:58.511806', 'step': 56, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:58.542198', 'step': 56, 'epoch': 1} {'type': 'loss', 'content': 0.024037647992372513, 'timestamp': '2025-09-15 03:18:58.544207', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:18:59.259705', 'step': 57, 'epoch': 1} {'type': 'pplx', 'content': 77745310.38449234, 'timestamp': '2025-09-15 03:18:59.261544', 'step': 57, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.290185', 'step': 57, 'epoch': 1} {'type': 'loss', 'content': 0.021767010912299156, 'timestamp': '2025-09-15 03:18:59.292295', 'step': 58, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.322380', 'step': 58, 'epoch': 1} {'type': 'loss', 'content': 0.017204873263835907, 'timestamp': '2025-09-15 03:18:59.324535', 'step': 59, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.354386', 'step': 59, 'epoch': 1} {'type': 'loss', 'content': 0.020821845158934593, 'timestamp': '2025-09-15 03:18:59.377913', 'step': 60, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.407692', 'step': 60, 'epoch': 1} {'type': 'loss', 'content': 0.018509017303586006, 'timestamp': '2025-09-15 03:18:59.409984', 'step': 61, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:59.439863', 'step': 61, 'epoch': 1} {'type': 'loss', 'content': 0.024281423538923264, 'timestamp': '2025-09-15 03:18:59.442025', 'step': 62, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:59.472179', 'step': 62, 'epoch': 1} {'type': 'loss', 'content': 0.008048784919083118, 'timestamp': '2025-09-15 03:18:59.474462', 'step': 63, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.504248', 'step': 63, 'epoch': 1} {'type': 'loss', 'content': 0.007551565300673246, 'timestamp': '2025-09-15 03:18:59.527837', 'step': 64, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.557797', 'step': 64, 'epoch': 1} {'type': 'loss', 'content': 0.010596475563943386, 'timestamp': '2025-09-15 03:18:59.560082', 'step': 65, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:59.589733', 'step': 65, 'epoch': 1} {'type': 'loss', 'content': 0.02982070855796337, 'timestamp': '2025-09-15 03:18:59.592639', 'step': 66, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.622117', 'step': 66, 'epoch': 1} {'type': 'loss', 'content': 0.012310145422816277, 'timestamp': '2025-09-15 03:18:59.624515', 'step': 67, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.654600', 'step': 67, 'epoch': 1} {'type': 'loss', 'content': 0.027565743774175644, 'timestamp': '2025-09-15 03:18:59.678557', 'step': 68, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:59.708673', 'step': 68, 'epoch': 1} {'type': 'loss', 'content': 0.029646441340446472, 'timestamp': '2025-09-15 03:18:59.710856', 'step': 69, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.740855', 'step': 69, 'epoch': 1} {'type': 'loss', 'content': 0.017770150676369667, 'timestamp': '2025-09-15 03:18:59.743111', 'step': 70, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.773170', 'step': 70, 'epoch': 1} {'type': 'loss', 'content': 0.013608084991574287, 'timestamp': '2025-09-15 03:18:59.775303', 'step': 71, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.804981', 'step': 71, 'epoch': 1} {'type': 'loss', 'content': 0.031161179766058922, 'timestamp': '2025-09-15 03:18:59.828472', 'step': 72, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:18:59.858047', 'step': 72, 'epoch': 1} {'type': 'loss', 'content': 0.019617022946476936, 'timestamp': '2025-09-15 03:18:59.860208', 'step': 73, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.889161', 'step': 73, 'epoch': 1} {'type': 'loss', 'content': 0.02784857526421547, 'timestamp': '2025-09-15 03:18:59.891174', 'step': 74, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:18:59.920539', 'step': 74, 'epoch': 1} {'type': 'loss', 'content': 0.016915084794163704, 'timestamp': '2025-09-15 03:18:59.922942', 'step': 75, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:18:59.952599', 'step': 75, 'epoch': 1} {'type': 'loss', 'content': 0.021168315783143044, 'timestamp': '2025-09-15 03:18:59.976206', 'step': 76, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.006233', 'step': 76, 'epoch': 1} {'type': 'loss', 'content': 0.019576409831643105, 'timestamp': '2025-09-15 03:19:00.008352', 'step': 77, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.038249', 'step': 77, 'epoch': 1} {'type': 'loss', 'content': 0.02989226020872593, 'timestamp': '2025-09-15 03:19:00.040376', 'step': 78, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.070001', 'step': 78, 'epoch': 1} {'type': 'loss', 'content': 0.029687274247407913, 'timestamp': '2025-09-15 03:19:00.072294', 'step': 79, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:00.101998', 'step': 79, 'epoch': 1} {'type': 'loss', 'content': 0.022338515147566795, 'timestamp': '2025-09-15 03:19:00.125621', 'step': 80, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.155569', 'step': 80, 'epoch': 1} {'type': 'loss', 'content': 0.02291913703083992, 'timestamp': '2025-09-15 03:19:00.157493', 'step': 81, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.186461', 'step': 81, 'epoch': 1} {'type': 'loss', 'content': 0.022767139598727226, 'timestamp': '2025-09-15 03:19:00.188649', 'step': 82, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.218787', 'step': 82, 'epoch': 1} {'type': 'loss', 'content': 0.027573389932513237, 'timestamp': '2025-09-15 03:19:00.221168', 'step': 83, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.250784', 'step': 83, 'epoch': 1} {'type': 'loss', 'content': 0.02591836452484131, 'timestamp': '2025-09-15 03:19:00.274430', 'step': 84, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.316999', 'step': 84, 'epoch': 1} {'type': 'loss', 'content': 0.022715887054800987, 'timestamp': '2025-09-15 03:19:00.318996', 'step': 85, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.348721', 'step': 85, 'epoch': 1} {'type': 'loss', 'content': 0.021058840677142143, 'timestamp': '2025-09-15 03:19:00.350847', 'step': 86, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.380696', 'step': 86, 'epoch': 1} {'type': 'loss', 'content': 0.017032312229275703, 'timestamp': '2025-09-15 03:19:00.382764', 'step': 87, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.412499', 'step': 87, 'epoch': 1} {'type': 'loss', 'content': 0.020295526832342148, 'timestamp': '2025-09-15 03:19:00.435875', 'step': 88, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.465171', 'step': 88, 'epoch': 1} {'type': 'loss', 'content': 0.02098015323281288, 'timestamp': '2025-09-15 03:19:00.467078', 'step': 89, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:00.498950', 'step': 89, 'epoch': 1} {'type': 'loss', 'content': 0.02113400027155876, 'timestamp': '2025-09-15 03:19:00.501326', 'step': 90, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.532873', 'step': 90, 'epoch': 1} {'type': 'loss', 'content': 0.022542644292116165, 'timestamp': '2025-09-15 03:19:00.534877', 'step': 91, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:00.564346', 'step': 91, 'epoch': 1} {'type': 'loss', 'content': 0.025110268965363503, 'timestamp': '2025-09-15 03:19:00.587847', 'step': 92, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.617394', 'step': 92, 'epoch': 1} {'type': 'loss', 'content': 0.017810607329010963, 'timestamp': '2025-09-15 03:19:00.619471', 'step': 93, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.649640', 'step': 93, 'epoch': 1} {'type': 'loss', 'content': 0.024205828085541725, 'timestamp': '2025-09-15 03:19:00.651663', 'step': 94, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.681964', 'step': 94, 'epoch': 1} {'type': 'loss', 'content': 0.019303206354379654, 'timestamp': '2025-09-15 03:19:00.683948', 'step': 95, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.713315', 'step': 95, 'epoch': 1} {'type': 'loss', 'content': 0.02511030063033104, 'timestamp': '2025-09-15 03:19:00.736847', 'step': 96, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.766897', 'step': 96, 'epoch': 1} {'type': 'loss', 'content': 0.018995894119143486, 'timestamp': '2025-09-15 03:19:00.769058', 'step': 97, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.799028', 'step': 97, 'epoch': 1} {'type': 'loss', 'content': 0.022378621622920036, 'timestamp': '2025-09-15 03:19:00.801181', 'step': 98, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.831567', 'step': 98, 'epoch': 1} {'type': 'loss', 'content': 0.01698601059615612, 'timestamp': '2025-09-15 03:19:00.833574', 'step': 99, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.863219', 'step': 99, 'epoch': 1} {'type': 'loss', 'content': 0.016753217205405235, 'timestamp': '2025-09-15 03:19:00.886611', 'step': 100, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.916543', 'step': 100, 'epoch': 1} {'type': 'loss', 'content': 0.01952139474451542, 'timestamp': '2025-09-15 03:19:00.918792', 'step': 101, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.948979', 'step': 101, 'epoch': 1} {'type': 'loss', 'content': 0.018787041306495667, 'timestamp': '2025-09-15 03:19:00.951249', 'step': 102, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:00.980867', 'step': 102, 'epoch': 1} {'type': 'loss', 'content': 0.020538141950964928, 'timestamp': '2025-09-15 03:19:00.982910', 'step': 103, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.011728', 'step': 103, 'epoch': 1} {'type': 'loss', 'content': 0.03242240101099014, 'timestamp': '2025-09-15 03:19:01.035422', 'step': 104, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.067236', 'step': 104, 'epoch': 1} {'type': 'loss', 'content': 0.03287802264094353, 'timestamp': '2025-09-15 03:19:01.069139', 'step': 105, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.098994', 'step': 105, 'epoch': 1} {'type': 'loss', 'content': 0.007900225929915905, 'timestamp': '2025-09-15 03:19:01.101346', 'step': 106, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:01.131902', 'step': 106, 'epoch': 1} {'type': 'loss', 'content': 0.02327314205467701, 'timestamp': '2025-09-15 03:19:01.134861', 'step': 107, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:01.164783', 'step': 107, 'epoch': 1} {'type': 'loss', 'content': 0.04384846240282059, 'timestamp': '2025-09-15 03:19:01.188312', 'step': 108, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.217910', 'step': 108, 'epoch': 1} {'type': 'loss', 'content': 0.0458383746445179, 'timestamp': '2025-09-15 03:19:01.219966', 'step': 109, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:01.259718', 'step': 109, 'epoch': 1} {'type': 'loss', 'content': 0.039285700768232346, 'timestamp': '2025-09-15 03:19:01.261751', 'step': 110, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.292034', 'step': 110, 'epoch': 1} {'type': 'loss', 'content': 0.007816492579877377, 'timestamp': '2025-09-15 03:19:01.293996', 'step': 111, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.324128', 'step': 111, 'epoch': 1} {'type': 'loss', 'content': 0.03153624385595322, 'timestamp': '2025-09-15 03:19:01.347768', 'step': 112, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:01.377623', 'step': 112, 'epoch': 1} {'type': 'loss', 'content': 0.028779255226254463, 'timestamp': '2025-09-15 03:19:01.380014', 'step': 113, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:01.409598', 'step': 113, 'epoch': 1} {'type': 'loss', 'content': 0.024018559604883194, 'timestamp': '2025-09-15 03:19:01.411698', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:02.174304', 'step': 114, 'epoch': 1} {'type': 'pplx', 'content': 86973788.16578543, 'timestamp': '2025-09-15 03:19:02.176690', 'step': 114, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.205542', 'step': 114, 'epoch': 1} {'type': 'loss', 'content': 0.008355635218322277, 'timestamp': '2025-09-15 03:19:02.208474', 'step': 115, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:02.238538', 'step': 115, 'epoch': 1} {'type': 'loss', 'content': 0.032105229794979095, 'timestamp': '2025-09-15 03:19:02.262149', 'step': 116, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.296821', 'step': 116, 'epoch': 1} {'type': 'loss', 'content': 0.019809599965810776, 'timestamp': '2025-09-15 03:19:02.299717', 'step': 117, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.331551', 'step': 117, 'epoch': 1} {'type': 'loss', 'content': 0.018696505576372147, 'timestamp': '2025-09-15 03:19:02.336465', 'step': 118, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.368402', 'step': 118, 'epoch': 1} {'type': 'loss', 'content': 0.016046470031142235, 'timestamp': '2025-09-15 03:19:02.370728', 'step': 119, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.402126', 'step': 119, 'epoch': 1} {'type': 'loss', 'content': 0.020479975268244743, 'timestamp': '2025-09-15 03:19:02.431958', 'step': 120, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.461857', 'step': 120, 'epoch': 1} {'type': 'loss', 'content': 0.03392161801457405, 'timestamp': '2025-09-15 03:19:02.464100', 'step': 121, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:02.493657', 'step': 121, 'epoch': 1} {'type': 'loss', 'content': 0.022408468648791313, 'timestamp': '2025-09-15 03:19:02.495858', 'step': 122, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:02.525604', 'step': 122, 'epoch': 1} {'type': 'loss', 'content': 0.022227082401514053, 'timestamp': '2025-09-15 03:19:02.529820', 'step': 123, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.559977', 'step': 123, 'epoch': 1} {'type': 'loss', 'content': 0.01427256129682064, 'timestamp': '2025-09-15 03:19:02.583505', 'step': 124, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.613343', 'step': 124, 'epoch': 1} {'type': 'loss', 'content': 0.019987378269433975, 'timestamp': '2025-09-15 03:19:02.615379', 'step': 125, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.644606', 'step': 125, 'epoch': 1} {'type': 'loss', 'content': 0.031713634729385376, 'timestamp': '2025-09-15 03:19:02.646783', 'step': 126, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:02.676253', 'step': 126, 'epoch': 1} {'type': 'loss', 'content': 0.01686117984354496, 'timestamp': '2025-09-15 03:19:02.678652', 'step': 127, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.712067', 'step': 127, 'epoch': 1} {'type': 'loss', 'content': 0.02143436297774315, 'timestamp': '2025-09-15 03:19:02.735628', 'step': 128, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.767720', 'step': 128, 'epoch': 1} {'type': 'loss', 'content': 0.022101953625679016, 'timestamp': '2025-09-15 03:19:02.769787', 'step': 129, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.800320', 'step': 129, 'epoch': 1} {'type': 'loss', 'content': 0.026244450360536575, 'timestamp': '2025-09-15 03:19:02.802437', 'step': 130, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.833505', 'step': 130, 'epoch': 1} {'type': 'loss', 'content': 0.02726527489721775, 'timestamp': '2025-09-15 03:19:02.835688', 'step': 131, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.866492', 'step': 131, 'epoch': 1} {'type': 'loss', 'content': 0.02093103528022766, 'timestamp': '2025-09-15 03:19:02.890041', 'step': 132, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:02.920009', 'step': 132, 'epoch': 1} {'type': 'loss', 'content': 0.016711866483092308, 'timestamp': '2025-09-15 03:19:02.922125', 'step': 133, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:02.952320', 'step': 133, 'epoch': 1} {'type': 'loss', 'content': 0.02590050920844078, 'timestamp': '2025-09-15 03:19:02.954478', 'step': 134, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:02.983972', 'step': 134, 'epoch': 1} {'type': 'loss', 'content': 0.022663580253720284, 'timestamp': '2025-09-15 03:19:02.986239', 'step': 135, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.016447', 'step': 135, 'epoch': 1} {'type': 'loss', 'content': 0.020828474313020706, 'timestamp': '2025-09-15 03:19:03.040014', 'step': 136, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:03.069773', 'step': 136, 'epoch': 1} {'type': 'loss', 'content': 0.02339036390185356, 'timestamp': '2025-09-15 03:19:03.072113', 'step': 137, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.102165', 'step': 137, 'epoch': 1} {'type': 'loss', 'content': 0.02548803947865963, 'timestamp': '2025-09-15 03:19:03.104408', 'step': 138, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.134903', 'step': 138, 'epoch': 1} {'type': 'loss', 'content': 0.01995263621211052, 'timestamp': '2025-09-15 03:19:03.138022', 'step': 139, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.168390', 'step': 139, 'epoch': 1} {'type': 'loss', 'content': 0.024372192099690437, 'timestamp': '2025-09-15 03:19:03.192161', 'step': 140, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.222149', 'step': 140, 'epoch': 1} {'type': 'loss', 'content': 0.019881976768374443, 'timestamp': '2025-09-15 03:19:03.224545', 'step': 141, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.254335', 'step': 141, 'epoch': 1} {'type': 'loss', 'content': 0.022397944703698158, 'timestamp': '2025-09-15 03:19:03.256524', 'step': 142, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.286144', 'step': 142, 'epoch': 1} {'type': 'loss', 'content': 0.024667898193001747, 'timestamp': '2025-09-15 03:19:03.288521', 'step': 143, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.319233', 'step': 143, 'epoch': 1} {'type': 'loss', 'content': 0.017718922346830368, 'timestamp': '2025-09-15 03:19:03.342865', 'step': 144, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.373055', 'step': 144, 'epoch': 1} {'type': 'loss', 'content': 0.01860089972615242, 'timestamp': '2025-09-15 03:19:03.375029', 'step': 145, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.405805', 'step': 145, 'epoch': 1} {'type': 'loss', 'content': 0.015553503297269344, 'timestamp': '2025-09-15 03:19:03.408018', 'step': 146, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.438209', 'step': 146, 'epoch': 1} {'type': 'loss', 'content': 0.021523339673876762, 'timestamp': '2025-09-15 03:19:03.441331', 'step': 147, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.471154', 'step': 147, 'epoch': 1} {'type': 'loss', 'content': 0.028602078557014465, 'timestamp': '2025-09-15 03:19:03.494609', 'step': 148, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.524627', 'step': 148, 'epoch': 1} {'type': 'loss', 'content': 0.03364170342683792, 'timestamp': '2025-09-15 03:19:03.526662', 'step': 149, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.556699', 'step': 149, 'epoch': 1} {'type': 'loss', 'content': 0.03490597754716873, 'timestamp': '2025-09-15 03:19:03.559392', 'step': 150, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.589524', 'step': 150, 'epoch': 1} {'type': 'loss', 'content': 0.018360691145062447, 'timestamp': '2025-09-15 03:19:03.591671', 'step': 151, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.621961', 'step': 151, 'epoch': 1} {'type': 'loss', 'content': 0.013131016865372658, 'timestamp': '2025-09-15 03:19:03.645413', 'step': 152, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:03.675118', 'step': 152, 'epoch': 1} {'type': 'loss', 'content': 0.014045425690710545, 'timestamp': '2025-09-15 03:19:03.677261', 'step': 153, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:03.707004', 'step': 153, 'epoch': 1} {'type': 'loss', 'content': 0.02355443872511387, 'timestamp': '2025-09-15 03:19:03.709315', 'step': 154, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.739404', 'step': 154, 'epoch': 1} {'type': 'loss', 'content': 0.0261994656175375, 'timestamp': '2025-09-15 03:19:03.741490', 'step': 155, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.772886', 'step': 155, 'epoch': 1} {'type': 'loss', 'content': 0.020213531330227852, 'timestamp': '2025-09-15 03:19:03.796337', 'step': 156, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.826456', 'step': 156, 'epoch': 1} {'type': 'loss', 'content': 0.038094911724328995, 'timestamp': '2025-09-15 03:19:03.828647', 'step': 157, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.857927', 'step': 157, 'epoch': 1} {'type': 'loss', 'content': 0.04792099818587303, 'timestamp': '2025-09-15 03:19:03.860142', 'step': 158, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:03.889704', 'step': 158, 'epoch': 1} {'type': 'loss', 'content': 0.03926343098282814, 'timestamp': '2025-09-15 03:19:03.891850', 'step': 159, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.921661', 'step': 159, 'epoch': 1} {'type': 'loss', 'content': 0.032103825360536575, 'timestamp': '2025-09-15 03:19:03.945243', 'step': 160, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:03.975950', 'step': 160, 'epoch': 1} {'type': 'loss', 'content': 0.02103709802031517, 'timestamp': '2025-09-15 03:19:03.978127', 'step': 161, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.007608', 'step': 161, 'epoch': 1} {'type': 'loss', 'content': 0.029447833076119423, 'timestamp': '2025-09-15 03:19:04.009812', 'step': 162, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.040241', 'step': 162, 'epoch': 1} {'type': 'loss', 'content': 0.023078102618455887, 'timestamp': '2025-09-15 03:19:04.042501', 'step': 163, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:04.072505', 'step': 163, 'epoch': 1} {'type': 'loss', 'content': 0.017872294411063194, 'timestamp': '2025-09-15 03:19:04.096077', 'step': 164, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.125768', 'step': 164, 'epoch': 1} {'type': 'loss', 'content': 0.02360287867486477, 'timestamp': '2025-09-15 03:19:04.128019', 'step': 165, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:04.157806', 'step': 165, 'epoch': 1} {'type': 'loss', 'content': 0.02339279279112816, 'timestamp': '2025-09-15 03:19:04.160109', 'step': 166, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.190157', 'step': 166, 'epoch': 1} {'type': 'loss', 'content': 0.02308400347828865, 'timestamp': '2025-09-15 03:19:04.192101', 'step': 167, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.221876', 'step': 167, 'epoch': 1} {'type': 'loss', 'content': 0.014306237921118736, 'timestamp': '2025-09-15 03:19:04.245401', 'step': 168, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.275219', 'step': 168, 'epoch': 1} {'type': 'loss', 'content': 0.02043766900897026, 'timestamp': '2025-09-15 03:19:04.277425', 'step': 169, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:04.307068', 'step': 169, 'epoch': 1} {'type': 'loss', 'content': 0.022694125771522522, 'timestamp': '2025-09-15 03:19:04.309160', 'step': 170, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:04.339510', 'step': 170, 'epoch': 1} {'type': 'loss', 'content': 0.018936406821012497, 'timestamp': '2025-09-15 03:19:04.341812', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:05.057952', 'step': 171, 'epoch': 1} {'type': 'pplx', 'content': 91425276.58815499, 'timestamp': '2025-09-15 03:19:05.061498', 'step': 171, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:05.092818', 'step': 171, 'epoch': 1} {'type': 'loss', 'content': 0.026795899495482445, 'timestamp': '2025-09-15 03:19:05.116285', 'step': 172, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.146417', 'step': 172, 'epoch': 1} {'type': 'loss', 'content': 0.020205063745379448, 'timestamp': '2025-09-15 03:19:05.148373', 'step': 173, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.177984', 'step': 173, 'epoch': 1} {'type': 'loss', 'content': 0.03856884315609932, 'timestamp': '2025-09-15 03:19:05.180286', 'step': 174, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.209930', 'step': 174, 'epoch': 1} {'type': 'loss', 'content': 0.013646108098328114, 'timestamp': '2025-09-15 03:19:05.212098', 'step': 175, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:05.241825', 'step': 175, 'epoch': 1} {'type': 'loss', 'content': 0.021882707253098488, 'timestamp': '2025-09-15 03:19:05.265174', 'step': 176, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.294597', 'step': 176, 'epoch': 1} {'type': 'loss', 'content': 0.014430088922381401, 'timestamp': '2025-09-15 03:19:05.296720', 'step': 177, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.329726', 'step': 177, 'epoch': 1} {'type': 'loss', 'content': 0.01406953576952219, 'timestamp': '2025-09-15 03:19:05.332206', 'step': 178, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.366219', 'step': 178, 'epoch': 1} {'type': 'loss', 'content': 0.011970186606049538, 'timestamp': '2025-09-15 03:19:05.368305', 'step': 179, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.397902', 'step': 179, 'epoch': 1} {'type': 'loss', 'content': 0.027014048770070076, 'timestamp': '2025-09-15 03:19:05.421672', 'step': 180, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.452003', 'step': 180, 'epoch': 1} {'type': 'loss', 'content': 0.022636890411376953, 'timestamp': '2025-09-15 03:19:05.454310', 'step': 181, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.483868', 'step': 181, 'epoch': 1} {'type': 'loss', 'content': 0.027193518355488777, 'timestamp': '2025-09-15 03:19:05.485894', 'step': 182, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.516030', 'step': 182, 'epoch': 1} {'type': 'loss', 'content': 0.04116543009877205, 'timestamp': '2025-09-15 03:19:05.518095', 'step': 183, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.547843', 'step': 183, 'epoch': 1} {'type': 'loss', 'content': 0.03132352605462074, 'timestamp': '2025-09-15 03:19:05.571327', 'step': 184, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:05.601250', 'step': 184, 'epoch': 1} {'type': 'loss', 'content': 0.02800433151423931, 'timestamp': '2025-09-15 03:19:05.603294', 'step': 185, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.633329', 'step': 185, 'epoch': 1} {'type': 'loss', 'content': 0.021477000787854195, 'timestamp': '2025-09-15 03:19:05.635508', 'step': 186, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.665347', 'step': 186, 'epoch': 1} {'type': 'loss', 'content': 0.028118086978793144, 'timestamp': '2025-09-15 03:19:05.669034', 'step': 187, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.698265', 'step': 187, 'epoch': 1} {'type': 'loss', 'content': 0.021604614332318306, 'timestamp': '2025-09-15 03:19:05.721675', 'step': 188, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.751500', 'step': 188, 'epoch': 1} {'type': 'loss', 'content': 0.01969420723617077, 'timestamp': '2025-09-15 03:19:05.753676', 'step': 189, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.783885', 'step': 189, 'epoch': 1} {'type': 'loss', 'content': 0.02482091449201107, 'timestamp': '2025-09-15 03:19:05.786206', 'step': 190, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:05.816527', 'step': 190, 'epoch': 1} {'type': 'loss', 'content': 0.02031696029007435, 'timestamp': '2025-09-15 03:19:05.818600', 'step': 191, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:05.848360', 'step': 191, 'epoch': 1} {'type': 'loss', 'content': 0.022723600268363953, 'timestamp': '2025-09-15 03:19:05.871726', 'step': 192, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.901894', 'step': 192, 'epoch': 1} {'type': 'loss', 'content': 0.016467997804284096, 'timestamp': '2025-09-15 03:19:05.904127', 'step': 193, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.933980', 'step': 193, 'epoch': 1} {'type': 'loss', 'content': 0.015772905200719833, 'timestamp': '2025-09-15 03:19:05.936522', 'step': 194, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:05.966591', 'step': 194, 'epoch': 1} {'type': 'loss', 'content': 0.019026650115847588, 'timestamp': '2025-09-15 03:19:05.968864', 'step': 195, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.000085', 'step': 195, 'epoch': 1} {'type': 'loss', 'content': 0.021972985938191414, 'timestamp': '2025-09-15 03:19:06.023661', 'step': 196, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:06.053380', 'step': 196, 'epoch': 1} {'type': 'loss', 'content': 0.015487863682210445, 'timestamp': '2025-09-15 03:19:06.055359', 'step': 197, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:06.088407', 'step': 197, 'epoch': 1} {'type': 'loss', 'content': 0.02404738776385784, 'timestamp': '2025-09-15 03:19:06.090919', 'step': 198, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:06.121158', 'step': 198, 'epoch': 1} {'type': 'loss', 'content': 0.02887124940752983, 'timestamp': '2025-09-15 03:19:06.123202', 'step': 199, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.153230', 'step': 199, 'epoch': 1} {'type': 'loss', 'content': 0.030241340398788452, 'timestamp': '2025-09-15 03:19:06.176720', 'step': 200, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:06.206392', 'step': 200, 'epoch': 1} {'type': 'loss', 'content': 0.018082991242408752, 'timestamp': '2025-09-15 03:19:06.208464', 'step': 201, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.237849', 'step': 201, 'epoch': 1} {'type': 'loss', 'content': 0.019952043890953064, 'timestamp': '2025-09-15 03:19:06.239745', 'step': 202, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.270129', 'step': 202, 'epoch': 1} {'type': 'loss', 'content': 0.022944288328289986, 'timestamp': '2025-09-15 03:19:06.272464', 'step': 203, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:06.302044', 'step': 203, 'epoch': 1} {'type': 'loss', 'content': 0.021453650668263435, 'timestamp': '2025-09-15 03:19:06.325520', 'step': 204, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:06.355312', 'step': 204, 'epoch': 1} {'type': 'loss', 'content': 0.02109239064157009, 'timestamp': '2025-09-15 03:19:06.357433', 'step': 205, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.387010', 'step': 205, 'epoch': 1} {'type': 'loss', 'content': 0.013666792772710323, 'timestamp': '2025-09-15 03:19:06.389078', 'step': 206, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.419204', 'step': 206, 'epoch': 1} {'type': 'loss', 'content': 0.028260722756385803, 'timestamp': '2025-09-15 03:19:06.421389', 'step': 207, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.451056', 'step': 207, 'epoch': 1} {'type': 'loss', 'content': 0.01960393413901329, 'timestamp': '2025-09-15 03:19:06.474604', 'step': 208, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.504721', 'step': 208, 'epoch': 1} {'type': 'loss', 'content': 0.02854198031127453, 'timestamp': '2025-09-15 03:19:06.507021', 'step': 209, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.536534', 'step': 209, 'epoch': 1} {'type': 'loss', 'content': 0.03184450417757034, 'timestamp': '2025-09-15 03:19:06.539231', 'step': 210, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.568389', 'step': 210, 'epoch': 1} {'type': 'loss', 'content': 0.020297560840845108, 'timestamp': '2025-09-15 03:19:06.570422', 'step': 211, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.600999', 'step': 211, 'epoch': 1} {'type': 'loss', 'content': 0.034596700221300125, 'timestamp': '2025-09-15 03:19:06.624533', 'step': 212, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.654388', 'step': 212, 'epoch': 1} {'type': 'loss', 'content': 0.030675770714879036, 'timestamp': '2025-09-15 03:19:06.656437', 'step': 213, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.685852', 'step': 213, 'epoch': 1} {'type': 'loss', 'content': 0.018920252099633217, 'timestamp': '2025-09-15 03:19:06.687976', 'step': 214, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.718802', 'step': 214, 'epoch': 1} {'type': 'loss', 'content': 0.026095913723111153, 'timestamp': '2025-09-15 03:19:06.720864', 'step': 215, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.750882', 'step': 215, 'epoch': 1} {'type': 'loss', 'content': 0.024462031200528145, 'timestamp': '2025-09-15 03:19:06.774322', 'step': 216, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:06.804840', 'step': 216, 'epoch': 1} {'type': 'loss', 'content': 0.01993633806705475, 'timestamp': '2025-09-15 03:19:06.806844', 'step': 217, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:06.836665', 'step': 217, 'epoch': 1} {'type': 'loss', 'content': 0.022599508985877037, 'timestamp': '2025-09-15 03:19:06.838996', 'step': 218, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.868600', 'step': 218, 'epoch': 1} {'type': 'loss', 'content': 0.023072870448231697, 'timestamp': '2025-09-15 03:19:06.870556', 'step': 219, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.899769', 'step': 219, 'epoch': 1} {'type': 'loss', 'content': 0.01495459396392107, 'timestamp': '2025-09-15 03:19:06.923135', 'step': 220, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.953247', 'step': 220, 'epoch': 1} {'type': 'loss', 'content': 0.020196115598082542, 'timestamp': '2025-09-15 03:19:06.955364', 'step': 221, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:06.984766', 'step': 221, 'epoch': 1} {'type': 'loss', 'content': 0.025984425097703934, 'timestamp': '2025-09-15 03:19:06.987145', 'step': 222, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:07.017052', 'step': 222, 'epoch': 1} {'type': 'loss', 'content': 0.018872182816267014, 'timestamp': '2025-09-15 03:19:07.019204', 'step': 223, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:07.048505', 'step': 223, 'epoch': 1} {'type': 'loss', 'content': 0.02921387180685997, 'timestamp': '2025-09-15 03:19:07.072120', 'step': 224, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:07.102941', 'step': 224, 'epoch': 1} {'type': 'loss', 'content': 0.016917483881115913, 'timestamp': '2025-09-15 03:19:07.105230', 'step': 225, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:07.134819', 'step': 225, 'epoch': 1} {'type': 'loss', 'content': 0.020182834938168526, 'timestamp': '2025-09-15 03:19:07.136924', 'step': 226, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:07.166693', 'step': 226, 'epoch': 1} {'type': 'loss', 'content': 0.02879491075873375, 'timestamp': '2025-09-15 03:19:07.169146', 'step': 227, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:07.198969', 'step': 227, 'epoch': 1} {'type': 'loss', 'content': 0.014776498079299927, 'timestamp': '2025-09-15 03:19:07.222511', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:07.932338', 'step': 228, 'epoch': 1} {'type': 'pplx', 'content': 95328226.03919551, 'timestamp': '2025-09-15 03:19:07.934365', 'step': 228, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:07.962339', 'step': 228, 'epoch': 1} {'type': 'loss', 'content': 0.01956128515303135, 'timestamp': '2025-09-15 03:19:07.964666', 'step': 229, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:07.995542', 'step': 229, 'epoch': 1} {'type': 'loss', 'content': 0.02058500424027443, 'timestamp': '2025-09-15 03:19:07.998761', 'step': 230, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.028450', 'step': 230, 'epoch': 1} {'type': 'loss', 'content': 0.021003449335694313, 'timestamp': '2025-09-15 03:19:08.030453', 'step': 231, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.060107', 'step': 231, 'epoch': 1} {'type': 'loss', 'content': 0.026520492509007454, 'timestamp': '2025-09-15 03:19:08.083483', 'step': 232, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:08.113321', 'step': 232, 'epoch': 1} {'type': 'loss', 'content': 0.021533282473683357, 'timestamp': '2025-09-15 03:19:08.115783', 'step': 233, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.146784', 'step': 233, 'epoch': 1} {'type': 'loss', 'content': 0.021764442324638367, 'timestamp': '2025-09-15 03:19:08.148763', 'step': 234, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.180901', 'step': 234, 'epoch': 1} {'type': 'loss', 'content': 0.01905788481235504, 'timestamp': '2025-09-15 03:19:08.183096', 'step': 235, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.212805', 'step': 235, 'epoch': 1} {'type': 'loss', 'content': 0.019037913531064987, 'timestamp': '2025-09-15 03:19:08.236329', 'step': 236, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:08.266192', 'step': 236, 'epoch': 1} {'type': 'loss', 'content': 0.020615851506590843, 'timestamp': '2025-09-15 03:19:08.268294', 'step': 237, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.298958', 'step': 237, 'epoch': 1} {'type': 'loss', 'content': 0.02502620592713356, 'timestamp': '2025-09-15 03:19:08.302145', 'step': 238, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.332058', 'step': 238, 'epoch': 1} {'type': 'loss', 'content': 0.028659885749220848, 'timestamp': '2025-09-15 03:19:08.334193', 'step': 239, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.363742', 'step': 239, 'epoch': 1} {'type': 'loss', 'content': 0.02334064617753029, 'timestamp': '2025-09-15 03:19:08.387132', 'step': 240, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:08.417280', 'step': 240, 'epoch': 1} {'type': 'loss', 'content': 0.01678556762635708, 'timestamp': '2025-09-15 03:19:08.419374', 'step': 241, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.452364', 'step': 241, 'epoch': 1} {'type': 'loss', 'content': 0.01673532836139202, 'timestamp': '2025-09-15 03:19:08.454476', 'step': 242, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.484480', 'step': 242, 'epoch': 1} {'type': 'loss', 'content': 0.015126006677746773, 'timestamp': '2025-09-15 03:19:08.486777', 'step': 243, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:08.516663', 'step': 243, 'epoch': 1} {'type': 'loss', 'content': 0.01618882827460766, 'timestamp': '2025-09-15 03:19:08.540110', 'step': 244, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.570018', 'step': 244, 'epoch': 1} {'type': 'loss', 'content': 0.0317390151321888, 'timestamp': '2025-09-15 03:19:08.572522', 'step': 245, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.602221', 'step': 245, 'epoch': 1} {'type': 'loss', 'content': 0.028386985883116722, 'timestamp': '2025-09-15 03:19:08.605608', 'step': 246, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:08.635427', 'step': 246, 'epoch': 1} {'type': 'loss', 'content': 0.029143307358026505, 'timestamp': '2025-09-15 03:19:08.637834', 'step': 247, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.667261', 'step': 247, 'epoch': 1} {'type': 'loss', 'content': 0.021952202543616295, 'timestamp': '2025-09-15 03:19:08.690738', 'step': 248, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.720309', 'step': 248, 'epoch': 1} {'type': 'loss', 'content': 0.03334729000926018, 'timestamp': '2025-09-15 03:19:08.722406', 'step': 249, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:08.752010', 'step': 249, 'epoch': 1} {'type': 'loss', 'content': 0.014858617447316647, 'timestamp': '2025-09-15 03:19:08.754124', 'step': 250, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.784404', 'step': 250, 'epoch': 1} {'type': 'loss', 'content': 0.02902458980679512, 'timestamp': '2025-09-15 03:19:08.786665', 'step': 251, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:08.816220', 'step': 251, 'epoch': 1} {'type': 'loss', 'content': 0.01632942259311676, 'timestamp': '2025-09-15 03:19:08.839671', 'step': 252, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:08.869985', 'step': 252, 'epoch': 1} {'type': 'loss', 'content': 0.013802406378090382, 'timestamp': '2025-09-15 03:19:08.872161', 'step': 253, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:08.901560', 'step': 253, 'epoch': 1} {'type': 'loss', 'content': 0.02780589461326599, 'timestamp': '2025-09-15 03:19:08.903841', 'step': 254, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:08.933951', 'step': 254, 'epoch': 1} {'type': 'loss', 'content': 0.013642417266964912, 'timestamp': '2025-09-15 03:19:08.936850', 'step': 255, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:08.966968', 'step': 255, 'epoch': 1} {'type': 'loss', 'content': 0.015592672862112522, 'timestamp': '2025-09-15 03:19:08.990658', 'step': 256, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.020439', 'step': 256, 'epoch': 1} {'type': 'loss', 'content': 0.01739800162613392, 'timestamp': '2025-09-15 03:19:09.022653', 'step': 257, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.052113', 'step': 257, 'epoch': 1} {'type': 'loss', 'content': 0.03306451067328453, 'timestamp': '2025-09-15 03:19:09.054150', 'step': 258, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.083625', 'step': 258, 'epoch': 1} {'type': 'loss', 'content': 0.02215319685637951, 'timestamp': '2025-09-15 03:19:09.085969', 'step': 259, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:09.116185', 'step': 259, 'epoch': 1} {'type': 'loss', 'content': 0.02427254430949688, 'timestamp': '2025-09-15 03:19:09.139671', 'step': 260, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.169285', 'step': 260, 'epoch': 1} {'type': 'loss', 'content': 0.0260100606828928, 'timestamp': '2025-09-15 03:19:09.171879', 'step': 261, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.201415', 'step': 261, 'epoch': 1} {'type': 'loss', 'content': 0.024701233953237534, 'timestamp': '2025-09-15 03:19:09.203584', 'step': 262, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:09.233865', 'step': 262, 'epoch': 1} {'type': 'loss', 'content': 0.02077108435332775, 'timestamp': '2025-09-15 03:19:09.236049', 'step': 263, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:09.265961', 'step': 263, 'epoch': 1} {'type': 'loss', 'content': 0.01494417805224657, 'timestamp': '2025-09-15 03:19:09.289581', 'step': 264, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.320083', 'step': 264, 'epoch': 1} {'type': 'loss', 'content': 0.022080078721046448, 'timestamp': '2025-09-15 03:19:09.322399', 'step': 265, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.352206', 'step': 265, 'epoch': 1} {'type': 'loss', 'content': 0.020438488572835922, 'timestamp': '2025-09-15 03:19:09.354393', 'step': 266, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:09.384316', 'step': 266, 'epoch': 1} {'type': 'loss', 'content': 0.021945010870695114, 'timestamp': '2025-09-15 03:19:09.386423', 'step': 267, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:09.416017', 'step': 267, 'epoch': 1} {'type': 'loss', 'content': 0.03710794076323509, 'timestamp': '2025-09-15 03:19:09.439621', 'step': 268, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:09.470007', 'step': 268, 'epoch': 1} {'type': 'loss', 'content': 0.015702739357948303, 'timestamp': '2025-09-15 03:19:09.472211', 'step': 269, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.501555', 'step': 269, 'epoch': 1} {'type': 'loss', 'content': 0.021318545565009117, 'timestamp': '2025-09-15 03:19:09.503655', 'step': 270, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.534917', 'step': 270, 'epoch': 1} {'type': 'loss', 'content': 0.023652182891964912, 'timestamp': '2025-09-15 03:19:09.536996', 'step': 271, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:09.566378', 'step': 271, 'epoch': 1} {'type': 'loss', 'content': 0.018605994060635567, 'timestamp': '2025-09-15 03:19:09.589895', 'step': 272, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.619087', 'step': 272, 'epoch': 1} {'type': 'loss', 'content': 0.026582153514027596, 'timestamp': '2025-09-15 03:19:09.621135', 'step': 273, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:09.650842', 'step': 273, 'epoch': 1} {'type': 'loss', 'content': 0.014253231696784496, 'timestamp': '2025-09-15 03:19:09.652951', 'step': 274, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.682729', 'step': 274, 'epoch': 1} {'type': 'loss', 'content': 0.019751057028770447, 'timestamp': '2025-09-15 03:19:09.684742', 'step': 275, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:09.714539', 'step': 275, 'epoch': 1} {'type': 'loss', 'content': 0.02657679282128811, 'timestamp': '2025-09-15 03:19:09.738017', 'step': 276, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.767917', 'step': 276, 'epoch': 1} {'type': 'loss', 'content': 0.010766022838652134, 'timestamp': '2025-09-15 03:19:09.769933', 'step': 277, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.799544', 'step': 277, 'epoch': 1} {'type': 'loss', 'content': 0.029790718108415604, 'timestamp': '2025-09-15 03:19:09.801652', 'step': 278, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:09.831257', 'step': 278, 'epoch': 1} {'type': 'loss', 'content': 0.023627398535609245, 'timestamp': '2025-09-15 03:19:09.834136', 'step': 279, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:09.864068', 'step': 279, 'epoch': 1} {'type': 'loss', 'content': 0.04631868749856949, 'timestamp': '2025-09-15 03:19:09.887689', 'step': 280, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.917469', 'step': 280, 'epoch': 1} {'type': 'loss', 'content': 0.017096029594540596, 'timestamp': '2025-09-15 03:19:09.919490', 'step': 281, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.950359', 'step': 281, 'epoch': 1} {'type': 'loss', 'content': 0.01746373064815998, 'timestamp': '2025-09-15 03:19:09.952329', 'step': 282, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:09.981577', 'step': 282, 'epoch': 1} {'type': 'loss', 'content': 0.02175072208046913, 'timestamp': '2025-09-15 03:19:09.983607', 'step': 283, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:10.013521', 'step': 283, 'epoch': 1} {'type': 'loss', 'content': 0.0345226414501667, 'timestamp': '2025-09-15 03:19:10.036963', 'step': 284, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:10.066817', 'step': 284, 'epoch': 1} {'type': 'loss', 'content': 0.02375132218003273, 'timestamp': '2025-09-15 03:19:10.068739', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:11.150475', 'step': 285, 'epoch': 1} {'type': 'pplx', 'content': 99808598.98644093, 'timestamp': '2025-09-15 03:19:11.152465', 'step': 285, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.194591', 'step': 285, 'epoch': 1} {'type': 'loss', 'content': 0.0117965592071414, 'timestamp': '2025-09-15 03:19:11.196757', 'step': 286, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:11.231707', 'step': 286, 'epoch': 1} {'type': 'loss', 'content': 0.018215393647551537, 'timestamp': '2025-09-15 03:19:11.234418', 'step': 287, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:11.266630', 'step': 287, 'epoch': 1} {'type': 'loss', 'content': 0.03157208487391472, 'timestamp': '2025-09-15 03:19:11.290263', 'step': 288, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.320396', 'step': 288, 'epoch': 1} {'type': 'loss', 'content': 0.029829951003193855, 'timestamp': '2025-09-15 03:19:11.322561', 'step': 289, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.352362', 'step': 289, 'epoch': 1} {'type': 'loss', 'content': 0.01235766801983118, 'timestamp': '2025-09-15 03:19:11.354458', 'step': 290, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.384759', 'step': 290, 'epoch': 1} {'type': 'loss', 'content': 0.021873388439416885, 'timestamp': '2025-09-15 03:19:11.387088', 'step': 291, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:11.417122', 'step': 291, 'epoch': 1} {'type': 'loss', 'content': 0.02612815983593464, 'timestamp': '2025-09-15 03:19:11.440806', 'step': 292, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.471321', 'step': 292, 'epoch': 1} {'type': 'loss', 'content': 0.01425966527312994, 'timestamp': '2025-09-15 03:19:11.473203', 'step': 293, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.516242', 'step': 293, 'epoch': 1} {'type': 'loss', 'content': 0.0279195886105299, 'timestamp': '2025-09-15 03:19:11.518210', 'step': 294, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.554934', 'step': 294, 'epoch': 1} {'type': 'loss', 'content': 0.018090257421135902, 'timestamp': '2025-09-15 03:19:11.556949', 'step': 295, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:11.588962', 'step': 295, 'epoch': 1} {'type': 'loss', 'content': 0.014155800454318523, 'timestamp': '2025-09-15 03:19:11.612405', 'step': 296, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.643174', 'step': 296, 'epoch': 1} {'type': 'loss', 'content': 0.019184812903404236, 'timestamp': '2025-09-15 03:19:11.645473', 'step': 297, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.678940', 'step': 297, 'epoch': 1} {'type': 'loss', 'content': 0.025441085919737816, 'timestamp': '2025-09-15 03:19:11.681103', 'step': 298, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:11.710918', 'step': 298, 'epoch': 1} {'type': 'loss', 'content': 0.015621578320860863, 'timestamp': '2025-09-15 03:19:11.713193', 'step': 299, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:11.743550', 'step': 299, 'epoch': 1} {'type': 'loss', 'content': 0.016921255737543106, 'timestamp': '2025-09-15 03:19:11.767940', 'step': 300, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.799824', 'step': 300, 'epoch': 1} {'type': 'loss', 'content': 0.023754924535751343, 'timestamp': '2025-09-15 03:19:11.801941', 'step': 301, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:11.831833', 'step': 301, 'epoch': 1} {'type': 'loss', 'content': 0.013065925799310207, 'timestamp': '2025-09-15 03:19:11.834215', 'step': 302, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.864436', 'step': 302, 'epoch': 1} {'type': 'loss', 'content': 0.01167286280542612, 'timestamp': '2025-09-15 03:19:11.866489', 'step': 303, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.895878', 'step': 303, 'epoch': 1} {'type': 'loss', 'content': 0.01580006815493107, 'timestamp': '2025-09-15 03:19:11.919994', 'step': 304, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.949991', 'step': 304, 'epoch': 1} {'type': 'loss', 'content': 0.0222313292324543, 'timestamp': '2025-09-15 03:19:11.952069', 'step': 305, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:11.981715', 'step': 305, 'epoch': 1} {'type': 'loss', 'content': 0.028958607465028763, 'timestamp': '2025-09-15 03:19:11.983938', 'step': 306, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.013152', 'step': 306, 'epoch': 1} {'type': 'loss', 'content': 0.02822468802332878, 'timestamp': '2025-09-15 03:19:12.015280', 'step': 307, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.044489', 'step': 307, 'epoch': 1} {'type': 'loss', 'content': 0.019177228212356567, 'timestamp': '2025-09-15 03:19:12.067927', 'step': 308, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.097815', 'step': 308, 'epoch': 1} {'type': 'loss', 'content': 0.00797706376761198, 'timestamp': '2025-09-15 03:19:12.100142', 'step': 309, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.130005', 'step': 309, 'epoch': 1} {'type': 'loss', 'content': 0.029054300859570503, 'timestamp': '2025-09-15 03:19:12.132173', 'step': 310, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.161911', 'step': 310, 'epoch': 1} {'type': 'loss', 'content': 0.029977068305015564, 'timestamp': '2025-09-15 03:19:12.165788', 'step': 311, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.196357', 'step': 311, 'epoch': 1} {'type': 'loss', 'content': 0.01888146437704563, 'timestamp': '2025-09-15 03:19:12.220112', 'step': 312, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.249610', 'step': 312, 'epoch': 1} {'type': 'loss', 'content': 0.024634139612317085, 'timestamp': '2025-09-15 03:19:12.255327', 'step': 313, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.288147', 'step': 313, 'epoch': 1} {'type': 'loss', 'content': 0.014435885474085808, 'timestamp': '2025-09-15 03:19:12.290509', 'step': 314, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.322817', 'step': 314, 'epoch': 1} {'type': 'loss', 'content': 0.028926042839884758, 'timestamp': '2025-09-15 03:19:12.324851', 'step': 315, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.355228', 'step': 315, 'epoch': 1} {'type': 'loss', 'content': 0.007600546348839998, 'timestamp': '2025-09-15 03:19:12.378821', 'step': 316, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.409323', 'step': 316, 'epoch': 1} {'type': 'loss', 'content': 0.010601741261780262, 'timestamp': '2025-09-15 03:19:12.411396', 'step': 317, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:12.441237', 'step': 317, 'epoch': 1} {'type': 'loss', 'content': 0.027423948049545288, 'timestamp': '2025-09-15 03:19:12.443004', 'step': 318, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.472957', 'step': 318, 'epoch': 1} {'type': 'loss', 'content': 0.015101133845746517, 'timestamp': '2025-09-15 03:19:12.475029', 'step': 319, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:12.505958', 'step': 319, 'epoch': 1} {'type': 'loss', 'content': 0.020047292113304138, 'timestamp': '2025-09-15 03:19:12.529618', 'step': 320, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:12.559602', 'step': 320, 'epoch': 1} {'type': 'loss', 'content': 0.018711814656853676, 'timestamp': '2025-09-15 03:19:12.561795', 'step': 321, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.592324', 'step': 321, 'epoch': 1} {'type': 'loss', 'content': 0.032741669565439224, 'timestamp': '2025-09-15 03:19:12.594579', 'step': 322, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.624364', 'step': 322, 'epoch': 1} {'type': 'loss', 'content': 0.029987605288624763, 'timestamp': '2025-09-15 03:19:12.626550', 'step': 323, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:12.656605', 'step': 323, 'epoch': 1} {'type': 'loss', 'content': 0.022992825135588646, 'timestamp': '2025-09-15 03:19:12.680198', 'step': 324, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:12.710073', 'step': 324, 'epoch': 1} {'type': 'loss', 'content': 0.024438347667455673, 'timestamp': '2025-09-15 03:19:12.712122', 'step': 325, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.741794', 'step': 325, 'epoch': 1} {'type': 'loss', 'content': 0.024056127294898033, 'timestamp': '2025-09-15 03:19:12.744030', 'step': 326, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.773363', 'step': 326, 'epoch': 1} {'type': 'loss', 'content': 0.02588737942278385, 'timestamp': '2025-09-15 03:19:12.775326', 'step': 327, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:12.807193', 'step': 327, 'epoch': 1} {'type': 'loss', 'content': 0.023963138461112976, 'timestamp': '2025-09-15 03:19:12.830777', 'step': 328, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:12.861014', 'step': 328, 'epoch': 1} {'type': 'loss', 'content': 0.02214493788778782, 'timestamp': '2025-09-15 03:19:12.863014', 'step': 329, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.893138', 'step': 329, 'epoch': 1} {'type': 'loss', 'content': 0.021407917141914368, 'timestamp': '2025-09-15 03:19:12.895126', 'step': 330, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:12.925095', 'step': 330, 'epoch': 1} {'type': 'loss', 'content': 0.020377272740006447, 'timestamp': '2025-09-15 03:19:12.927226', 'step': 331, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:12.957070', 'step': 331, 'epoch': 1} {'type': 'loss', 'content': 0.028564879670739174, 'timestamp': '2025-09-15 03:19:12.980628', 'step': 332, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:13.010328', 'step': 332, 'epoch': 1} {'type': 'loss', 'content': 0.025727489963173866, 'timestamp': '2025-09-15 03:19:13.012523', 'step': 333, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:13.042417', 'step': 333, 'epoch': 1} {'type': 'loss', 'content': 0.02494569681584835, 'timestamp': '2025-09-15 03:19:13.044487', 'step': 334, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:13.074241', 'step': 334, 'epoch': 1} {'type': 'loss', 'content': 0.02045329287648201, 'timestamp': '2025-09-15 03:19:13.076519', 'step': 335, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:13.107170', 'step': 335, 'epoch': 1} {'type': 'loss', 'content': 0.02221083827316761, 'timestamp': '2025-09-15 03:19:13.130954', 'step': 336, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:13.161056', 'step': 336, 'epoch': 1} {'type': 'loss', 'content': 0.025785459205508232, 'timestamp': '2025-09-15 03:19:13.164065', 'step': 337, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:13.193949', 'step': 337, 'epoch': 1} {'type': 'loss', 'content': 0.0212246123701334, 'timestamp': '2025-09-15 03:19:13.196340', 'step': 338, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:13.225665', 'step': 338, 'epoch': 1} {'type': 'loss', 'content': 0.0205333661288023, 'timestamp': '2025-09-15 03:19:13.228179', 'step': 339, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:13.258176', 'step': 339, 'epoch': 1} {'type': 'loss', 'content': 0.01782972179353237, 'timestamp': '2025-09-15 03:19:13.281652', 'step': 340, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:13.312113', 'step': 340, 'epoch': 1} {'type': 'loss', 'content': 0.019433436915278435, 'timestamp': '2025-09-15 03:19:13.314493', 'step': 341, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:13.344611', 'step': 341, 'epoch': 1} {'type': 'loss', 'content': 0.02971636690199375, 'timestamp': '2025-09-15 03:19:13.346966', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:14.057353', 'step': 342, 'epoch': 1} {'type': 'pplx', 'content': 103611561.54396842, 'timestamp': '2025-09-15 03:19:14.059253', 'step': 342, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.088526', 'step': 342, 'epoch': 1} {'type': 'loss', 'content': 0.020249316468834877, 'timestamp': '2025-09-15 03:19:14.090455', 'step': 343, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.122006', 'step': 343, 'epoch': 1} {'type': 'loss', 'content': 0.04564950615167618, 'timestamp': '2025-09-15 03:19:14.145521', 'step': 344, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.175077', 'step': 344, 'epoch': 1} {'type': 'loss', 'content': 0.024136368185281754, 'timestamp': '2025-09-15 03:19:14.177680', 'step': 345, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.209004', 'step': 345, 'epoch': 1} {'type': 'loss', 'content': 0.03156773000955582, 'timestamp': '2025-09-15 03:19:14.211186', 'step': 346, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.241016', 'step': 346, 'epoch': 1} {'type': 'loss', 'content': 0.024225564673542976, 'timestamp': '2025-09-15 03:19:14.243533', 'step': 347, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.273558', 'step': 347, 'epoch': 1} {'type': 'loss', 'content': 0.025378478690981865, 'timestamp': '2025-09-15 03:19:14.297318', 'step': 348, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.327983', 'step': 348, 'epoch': 1} {'type': 'loss', 'content': 0.0185211431235075, 'timestamp': '2025-09-15 03:19:14.330186', 'step': 349, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.360470', 'step': 349, 'epoch': 1} {'type': 'loss', 'content': 0.029448067769408226, 'timestamp': '2025-09-15 03:19:14.362492', 'step': 350, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.392848', 'step': 350, 'epoch': 1} {'type': 'loss', 'content': 0.03555438667535782, 'timestamp': '2025-09-15 03:19:14.395026', 'step': 351, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.426099', 'step': 351, 'epoch': 1} {'type': 'loss', 'content': 0.0238653514534235, 'timestamp': '2025-09-15 03:19:14.449561', 'step': 352, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.479231', 'step': 352, 'epoch': 1} {'type': 'loss', 'content': 0.03759421780705452, 'timestamp': '2025-09-15 03:19:14.481329', 'step': 353, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.511305', 'step': 353, 'epoch': 1} {'type': 'loss', 'content': 0.026176365092396736, 'timestamp': '2025-09-15 03:19:14.513427', 'step': 354, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.543093', 'step': 354, 'epoch': 1} {'type': 'loss', 'content': 0.02880324050784111, 'timestamp': '2025-09-15 03:19:14.545168', 'step': 355, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.575392', 'step': 355, 'epoch': 1} {'type': 'loss', 'content': 0.012668220326304436, 'timestamp': '2025-09-15 03:19:14.598912', 'step': 356, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.629312', 'step': 356, 'epoch': 1} {'type': 'loss', 'content': 0.017717767506837845, 'timestamp': '2025-09-15 03:19:14.631250', 'step': 357, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.661090', 'step': 357, 'epoch': 1} {'type': 'loss', 'content': 0.016119273379445076, 'timestamp': '2025-09-15 03:19:14.663260', 'step': 358, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.692787', 'step': 358, 'epoch': 1} {'type': 'loss', 'content': 0.011544623412191868, 'timestamp': '2025-09-15 03:19:14.694825', 'step': 359, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:14.724842', 'step': 359, 'epoch': 1} {'type': 'loss', 'content': 0.016963688656687737, 'timestamp': '2025-09-15 03:19:14.748390', 'step': 360, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:14.778525', 'step': 360, 'epoch': 1} {'type': 'loss', 'content': 0.0260551106184721, 'timestamp': '2025-09-15 03:19:14.780863', 'step': 361, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.812461', 'step': 361, 'epoch': 1} {'type': 'loss', 'content': 0.029832622036337852, 'timestamp': '2025-09-15 03:19:14.814710', 'step': 362, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.845126', 'step': 362, 'epoch': 1} {'type': 'loss', 'content': 0.020663557574152946, 'timestamp': '2025-09-15 03:19:14.847561', 'step': 363, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.878419', 'step': 363, 'epoch': 1} {'type': 'loss', 'content': 0.012055516242980957, 'timestamp': '2025-09-15 03:19:14.902621', 'step': 364, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:14.933484', 'step': 364, 'epoch': 1} {'type': 'loss', 'content': 0.02465776540338993, 'timestamp': '2025-09-15 03:19:14.936922', 'step': 365, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:14.967549', 'step': 365, 'epoch': 1} {'type': 'loss', 'content': 0.014459922909736633, 'timestamp': '2025-09-15 03:19:14.970185', 'step': 366, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:15.000758', 'step': 366, 'epoch': 1} {'type': 'loss', 'content': 0.017588822171092033, 'timestamp': '2025-09-15 03:19:15.003192', 'step': 367, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.033480', 'step': 367, 'epoch': 1} {'type': 'loss', 'content': 0.028651872649788857, 'timestamp': '2025-09-15 03:19:15.056929', 'step': 368, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.087563', 'step': 368, 'epoch': 1} {'type': 'loss', 'content': 0.022055622190237045, 'timestamp': '2025-09-15 03:19:15.089746', 'step': 369, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:15.120493', 'step': 369, 'epoch': 1} {'type': 'loss', 'content': 0.025541191920638084, 'timestamp': '2025-09-15 03:19:15.122779', 'step': 370, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.153139', 'step': 370, 'epoch': 1} {'type': 'loss', 'content': 0.01346271950751543, 'timestamp': '2025-09-15 03:19:15.155147', 'step': 371, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.184446', 'step': 371, 'epoch': 1} {'type': 'loss', 'content': 0.029344525188207626, 'timestamp': '2025-09-15 03:19:15.208168', 'step': 372, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:15.238243', 'step': 372, 'epoch': 1} {'type': 'loss', 'content': 0.02545652911067009, 'timestamp': '2025-09-15 03:19:15.240521', 'step': 373, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.270660', 'step': 373, 'epoch': 1} {'type': 'loss', 'content': 0.02384101040661335, 'timestamp': '2025-09-15 03:19:15.272978', 'step': 374, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.303721', 'step': 374, 'epoch': 1} {'type': 'loss', 'content': 0.01691906340420246, 'timestamp': '2025-09-15 03:19:15.305806', 'step': 375, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.336539', 'step': 375, 'epoch': 1} {'type': 'loss', 'content': 0.014407488517463207, 'timestamp': '2025-09-15 03:19:15.360275', 'step': 376, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.390488', 'step': 376, 'epoch': 1} {'type': 'loss', 'content': 0.01607523113489151, 'timestamp': '2025-09-15 03:19:15.393291', 'step': 377, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.422854', 'step': 377, 'epoch': 1} {'type': 'loss', 'content': 0.022174837067723274, 'timestamp': '2025-09-15 03:19:15.425325', 'step': 378, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.455251', 'step': 378, 'epoch': 1} {'type': 'loss', 'content': 0.019380640238523483, 'timestamp': '2025-09-15 03:19:15.457431', 'step': 379, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:15.487035', 'step': 379, 'epoch': 1} {'type': 'loss', 'content': 0.01895737834274769, 'timestamp': '2025-09-15 03:19:15.510478', 'step': 380, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.540937', 'step': 380, 'epoch': 1} {'type': 'loss', 'content': 0.02109598182141781, 'timestamp': '2025-09-15 03:19:15.543200', 'step': 381, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:15.573164', 'step': 381, 'epoch': 1} {'type': 'loss', 'content': 0.021322503685951233, 'timestamp': '2025-09-15 03:19:15.575138', 'step': 382, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.604655', 'step': 382, 'epoch': 1} {'type': 'loss', 'content': 0.016983283683657646, 'timestamp': '2025-09-15 03:19:15.606731', 'step': 383, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:15.636218', 'step': 383, 'epoch': 1} {'type': 'loss', 'content': 0.023025959730148315, 'timestamp': '2025-09-15 03:19:15.659533', 'step': 384, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:15.689472', 'step': 384, 'epoch': 1} {'type': 'loss', 'content': 0.01357300765812397, 'timestamp': '2025-09-15 03:19:15.691565', 'step': 385, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.721118', 'step': 385, 'epoch': 1} {'type': 'loss', 'content': 0.01553942821919918, 'timestamp': '2025-09-15 03:19:15.723381', 'step': 386, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.753008', 'step': 386, 'epoch': 1} {'type': 'loss', 'content': 0.016792044043540955, 'timestamp': '2025-09-15 03:19:15.755077', 'step': 387, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.784626', 'step': 387, 'epoch': 1} {'type': 'loss', 'content': 0.02156914956867695, 'timestamp': '2025-09-15 03:19:15.808187', 'step': 388, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:15.856953', 'step': 388, 'epoch': 1} {'type': 'loss', 'content': 0.017255697399377823, 'timestamp': '2025-09-15 03:19:15.859593', 'step': 389, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.889577', 'step': 389, 'epoch': 1} {'type': 'loss', 'content': 0.030908426269888878, 'timestamp': '2025-09-15 03:19:15.891774', 'step': 390, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.921381', 'step': 390, 'epoch': 1} {'type': 'loss', 'content': 0.035414181649684906, 'timestamp': '2025-09-15 03:19:15.923879', 'step': 391, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:15.953734', 'step': 391, 'epoch': 1} {'type': 'loss', 'content': 0.013454231433570385, 'timestamp': '2025-09-15 03:19:15.977493', 'step': 392, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:16.007644', 'step': 392, 'epoch': 1} {'type': 'loss', 'content': 0.01388184167444706, 'timestamp': '2025-09-15 03:19:16.009665', 'step': 393, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:16.039889', 'step': 393, 'epoch': 1} {'type': 'loss', 'content': 0.018477659672498703, 'timestamp': '2025-09-15 03:19:16.042544', 'step': 394, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:16.073246', 'step': 394, 'epoch': 1} {'type': 'loss', 'content': 0.00951298326253891, 'timestamp': '2025-09-15 03:19:16.075369', 'step': 395, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:16.105573', 'step': 395, 'epoch': 1} {'type': 'loss', 'content': 0.022019732743501663, 'timestamp': '2025-09-15 03:19:16.128995', 'step': 396, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:16.158640', 'step': 396, 'epoch': 1} {'type': 'loss', 'content': 0.007529764901846647, 'timestamp': '2025-09-15 03:19:16.160824', 'step': 397, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:16.190862', 'step': 397, 'epoch': 1} {'type': 'loss', 'content': 0.013377663679420948, 'timestamp': '2025-09-15 03:19:16.193028', 'step': 398, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:16.222718', 'step': 398, 'epoch': 1} {'type': 'loss', 'content': 0.02092798426747322, 'timestamp': '2025-09-15 03:19:16.224930', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:16.937105', 'step': 399, 'epoch': 1} {'type': 'pplx', 'content': 106969624.85667004, 'timestamp': '2025-09-15 03:19:16.938941', 'step': 399, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:16.968244', 'step': 399, 'epoch': 1} {'type': 'loss', 'content': 0.016222558915615082, 'timestamp': '2025-09-15 03:19:16.991840', 'step': 400, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.022374', 'step': 400, 'epoch': 1} {'type': 'loss', 'content': 0.006949161179363728, 'timestamp': '2025-09-15 03:19:17.024691', 'step': 401, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:17.054915', 'step': 401, 'epoch': 1} {'type': 'loss', 'content': 0.021258166059851646, 'timestamp': '2025-09-15 03:19:17.057139', 'step': 402, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.087178', 'step': 402, 'epoch': 1} {'type': 'loss', 'content': 0.02063274383544922, 'timestamp': '2025-09-15 03:19:17.089528', 'step': 403, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.119255', 'step': 403, 'epoch': 1} {'type': 'loss', 'content': 0.0355384424328804, 'timestamp': '2025-09-15 03:19:17.143008', 'step': 404, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.173120', 'step': 404, 'epoch': 1} {'type': 'loss', 'content': 0.02840801514685154, 'timestamp': '2025-09-15 03:19:17.175236', 'step': 405, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.204937', 'step': 405, 'epoch': 1} {'type': 'loss', 'content': 0.035722844302654266, 'timestamp': '2025-09-15 03:19:17.207127', 'step': 406, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.236656', 'step': 406, 'epoch': 1} {'type': 'loss', 'content': 0.009866001084446907, 'timestamp': '2025-09-15 03:19:17.241929', 'step': 407, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:17.274706', 'step': 407, 'epoch': 1} {'type': 'loss', 'content': 0.007772717159241438, 'timestamp': '2025-09-15 03:19:17.298620', 'step': 408, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.328616', 'step': 408, 'epoch': 1} {'type': 'loss', 'content': 0.02096855826675892, 'timestamp': '2025-09-15 03:19:17.330936', 'step': 409, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.362169', 'step': 409, 'epoch': 1} {'type': 'loss', 'content': 0.021432984620332718, 'timestamp': '2025-09-15 03:19:17.364656', 'step': 410, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.395211', 'step': 410, 'epoch': 1} {'type': 'loss', 'content': 0.016841476783156395, 'timestamp': '2025-09-15 03:19:17.397416', 'step': 411, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:17.427588', 'step': 411, 'epoch': 1} {'type': 'loss', 'content': 0.03731584548950195, 'timestamp': '2025-09-15 03:19:17.451200', 'step': 412, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.481242', 'step': 412, 'epoch': 1} {'type': 'loss', 'content': 0.010997510515153408, 'timestamp': '2025-09-15 03:19:17.483095', 'step': 413, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.513132', 'step': 413, 'epoch': 1} {'type': 'loss', 'content': 0.03299867734313011, 'timestamp': '2025-09-15 03:19:17.515254', 'step': 414, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.544807', 'step': 414, 'epoch': 1} {'type': 'loss', 'content': 0.04234781861305237, 'timestamp': '2025-09-15 03:19:17.546814', 'step': 415, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.577458', 'step': 415, 'epoch': 1} {'type': 'loss', 'content': 0.008629407733678818, 'timestamp': '2025-09-15 03:19:17.601149', 'step': 416, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.631045', 'step': 416, 'epoch': 1} {'type': 'loss', 'content': 0.013242745772004128, 'timestamp': '2025-09-15 03:19:17.633065', 'step': 417, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.663210', 'step': 417, 'epoch': 1} {'type': 'loss', 'content': 0.02678370475769043, 'timestamp': '2025-09-15 03:19:17.665528', 'step': 418, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.695230', 'step': 418, 'epoch': 1} {'type': 'loss', 'content': 0.02190086431801319, 'timestamp': '2025-09-15 03:19:17.697529', 'step': 419, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.727255', 'step': 419, 'epoch': 1} {'type': 'loss', 'content': 0.01396601740270853, 'timestamp': '2025-09-15 03:19:17.750708', 'step': 420, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.780889', 'step': 420, 'epoch': 1} {'type': 'loss', 'content': 0.015750134363770485, 'timestamp': '2025-09-15 03:19:17.782968', 'step': 421, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.812631', 'step': 421, 'epoch': 1} {'type': 'loss', 'content': 0.014036260545253754, 'timestamp': '2025-09-15 03:19:17.814594', 'step': 422, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:17.845147', 'step': 422, 'epoch': 1} {'type': 'loss', 'content': 0.019544070586562157, 'timestamp': '2025-09-15 03:19:17.847507', 'step': 423, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.877464', 'step': 423, 'epoch': 1} {'type': 'loss', 'content': 0.01638009026646614, 'timestamp': '2025-09-15 03:19:17.900894', 'step': 424, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:17.931143', 'step': 424, 'epoch': 1} {'type': 'loss', 'content': 0.01974562369287014, 'timestamp': '2025-09-15 03:19:17.933286', 'step': 425, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:17.963320', 'step': 425, 'epoch': 1} {'type': 'loss', 'content': 0.022785307839512825, 'timestamp': '2025-09-15 03:19:17.965504', 'step': 426, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:17.995921', 'step': 426, 'epoch': 1} {'type': 'loss', 'content': 0.019113395363092422, 'timestamp': '2025-09-15 03:19:17.998035', 'step': 427, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.028032', 'step': 427, 'epoch': 1} {'type': 'loss', 'content': 0.013610420748591423, 'timestamp': '2025-09-15 03:19:18.051538', 'step': 428, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.081768', 'step': 428, 'epoch': 1} {'type': 'loss', 'content': 0.02456537075340748, 'timestamp': '2025-09-15 03:19:18.083833', 'step': 429, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:18.115577', 'step': 429, 'epoch': 1} {'type': 'loss', 'content': 0.02188361994922161, 'timestamp': '2025-09-15 03:19:18.118002', 'step': 430, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.148465', 'step': 430, 'epoch': 1} {'type': 'loss', 'content': 0.013410343788564205, 'timestamp': '2025-09-15 03:19:18.150469', 'step': 431, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:18.180379', 'step': 431, 'epoch': 1} {'type': 'loss', 'content': 0.02152617648243904, 'timestamp': '2025-09-15 03:19:18.204057', 'step': 432, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.234081', 'step': 432, 'epoch': 1} {'type': 'loss', 'content': 0.017813067883253098, 'timestamp': '2025-09-15 03:19:18.236156', 'step': 433, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:18.266139', 'step': 433, 'epoch': 1} {'type': 'loss', 'content': 0.02423473261296749, 'timestamp': '2025-09-15 03:19:18.268706', 'step': 434, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.298971', 'step': 434, 'epoch': 1} {'type': 'loss', 'content': 0.01806381531059742, 'timestamp': '2025-09-15 03:19:18.301299', 'step': 435, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.331157', 'step': 435, 'epoch': 1} {'type': 'loss', 'content': 0.01698886603116989, 'timestamp': '2025-09-15 03:19:18.355447', 'step': 436, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.385615', 'step': 436, 'epoch': 1} {'type': 'loss', 'content': 0.025869715958833694, 'timestamp': '2025-09-15 03:19:18.388025', 'step': 437, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.417735', 'step': 437, 'epoch': 1} {'type': 'loss', 'content': 0.02636927366256714, 'timestamp': '2025-09-15 03:19:18.420056', 'step': 438, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.449967', 'step': 438, 'epoch': 1} {'type': 'loss', 'content': 0.04045248404145241, 'timestamp': '2025-09-15 03:19:18.452124', 'step': 439, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.481852', 'step': 439, 'epoch': 1} {'type': 'loss', 'content': 0.02000604197382927, 'timestamp': '2025-09-15 03:19:18.505406', 'step': 440, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.535658', 'step': 440, 'epoch': 1} {'type': 'loss', 'content': 0.03176813945174217, 'timestamp': '2025-09-15 03:19:18.537779', 'step': 441, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.567747', 'step': 441, 'epoch': 1} {'type': 'loss', 'content': 0.03146418556571007, 'timestamp': '2025-09-15 03:19:18.569892', 'step': 442, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:18.604064', 'step': 442, 'epoch': 1} {'type': 'loss', 'content': 0.012470101937651634, 'timestamp': '2025-09-15 03:19:18.606130', 'step': 443, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:18.636695', 'step': 443, 'epoch': 1} {'type': 'loss', 'content': 0.03529360517859459, 'timestamp': '2025-09-15 03:19:18.660438', 'step': 444, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.690103', 'step': 444, 'epoch': 1} {'type': 'loss', 'content': 0.010506563819944859, 'timestamp': '2025-09-15 03:19:18.692093', 'step': 445, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.721987', 'step': 445, 'epoch': 1} {'type': 'loss', 'content': 0.019400015473365784, 'timestamp': '2025-09-15 03:19:18.724012', 'step': 446, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:18.754433', 'step': 446, 'epoch': 1} {'type': 'loss', 'content': 0.007648042868822813, 'timestamp': '2025-09-15 03:19:18.756768', 'step': 447, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.786715', 'step': 447, 'epoch': 1} {'type': 'loss', 'content': 0.013138832524418831, 'timestamp': '2025-09-15 03:19:18.810090', 'step': 448, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:18.840475', 'step': 448, 'epoch': 1} {'type': 'loss', 'content': 0.008471040055155754, 'timestamp': '2025-09-15 03:19:18.842879', 'step': 449, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.873896', 'step': 449, 'epoch': 1} {'type': 'loss', 'content': 0.030580993741750717, 'timestamp': '2025-09-15 03:19:18.876001', 'step': 450, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.906957', 'step': 450, 'epoch': 1} {'type': 'loss', 'content': 0.021732794120907784, 'timestamp': '2025-09-15 03:19:18.909053', 'step': 451, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.939066', 'step': 451, 'epoch': 1} {'type': 'loss', 'content': 0.008800463750958443, 'timestamp': '2025-09-15 03:19:18.964526', 'step': 452, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:18.995324', 'step': 452, 'epoch': 1} {'type': 'loss', 'content': 0.017960723489522934, 'timestamp': '2025-09-15 03:19:18.997206', 'step': 453, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:19.027401', 'step': 453, 'epoch': 1} {'type': 'loss', 'content': 0.019798072054982185, 'timestamp': '2025-09-15 03:19:19.030007', 'step': 454, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:19.060014', 'step': 454, 'epoch': 1} {'type': 'loss', 'content': 0.0256817489862442, 'timestamp': '2025-09-15 03:19:19.062247', 'step': 455, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:19.092101', 'step': 455, 'epoch': 1} {'type': 'loss', 'content': 0.012163102626800537, 'timestamp': '2025-09-15 03:19:19.115459', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:19.830229', 'step': 456, 'epoch': 1} {'type': 'pplx', 'content': 106593515.23342364, 'timestamp': '2025-09-15 03:19:19.832687', 'step': 456, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:19.860789', 'step': 456, 'epoch': 1} {'type': 'loss', 'content': 0.022691946476697922, 'timestamp': '2025-09-15 03:19:19.862834', 'step': 457, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:19.894401', 'step': 457, 'epoch': 1} {'type': 'loss', 'content': 0.033485524356365204, 'timestamp': '2025-09-15 03:19:19.896450', 'step': 458, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:19.926149', 'step': 458, 'epoch': 1} {'type': 'loss', 'content': 0.011042545549571514, 'timestamp': '2025-09-15 03:19:19.928445', 'step': 459, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:19.958705', 'step': 459, 'epoch': 1} {'type': 'loss', 'content': 0.032331619411706924, 'timestamp': '2025-09-15 03:19:19.982465', 'step': 460, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.012147', 'step': 460, 'epoch': 1} {'type': 'loss', 'content': 0.018903549760580063, 'timestamp': '2025-09-15 03:19:20.014388', 'step': 461, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.044231', 'step': 461, 'epoch': 1} {'type': 'loss', 'content': 0.011640116572380066, 'timestamp': '2025-09-15 03:19:20.046361', 'step': 462, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.076555', 'step': 462, 'epoch': 1} {'type': 'loss', 'content': 0.009794848039746284, 'timestamp': '2025-09-15 03:19:20.078607', 'step': 463, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.108563', 'step': 463, 'epoch': 1} {'type': 'loss', 'content': 0.02081950567662716, 'timestamp': '2025-09-15 03:19:20.132053', 'step': 464, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.162248', 'step': 464, 'epoch': 1} {'type': 'loss', 'content': 0.01612214185297489, 'timestamp': '2025-09-15 03:19:20.164354', 'step': 465, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.194560', 'step': 465, 'epoch': 1} {'type': 'loss', 'content': 0.014096572063863277, 'timestamp': '2025-09-15 03:19:20.196538', 'step': 466, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.226672', 'step': 466, 'epoch': 1} {'type': 'loss', 'content': 0.019644200801849365, 'timestamp': '2025-09-15 03:19:20.228929', 'step': 467, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.258638', 'step': 467, 'epoch': 1} {'type': 'loss', 'content': 0.02487204596400261, 'timestamp': '2025-09-15 03:19:20.282026', 'step': 468, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.311901', 'step': 468, 'epoch': 1} {'type': 'loss', 'content': 0.02327965758740902, 'timestamp': '2025-09-15 03:19:20.314077', 'step': 469, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.343694', 'step': 469, 'epoch': 1} {'type': 'loss', 'content': 0.03325268253684044, 'timestamp': '2025-09-15 03:19:20.345817', 'step': 470, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.375868', 'step': 470, 'epoch': 1} {'type': 'loss', 'content': 0.0069459849037230015, 'timestamp': '2025-09-15 03:19:20.378369', 'step': 471, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.408212', 'step': 471, 'epoch': 1} {'type': 'loss', 'content': 0.01827104203402996, 'timestamp': '2025-09-15 03:19:20.431708', 'step': 472, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:20.462068', 'step': 472, 'epoch': 1} {'type': 'loss', 'content': 0.019391119480133057, 'timestamp': '2025-09-15 03:19:20.464671', 'step': 473, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:20.494805', 'step': 473, 'epoch': 1} {'type': 'loss', 'content': 0.013398611918091774, 'timestamp': '2025-09-15 03:19:20.497228', 'step': 474, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:20.526884', 'step': 474, 'epoch': 1} {'type': 'loss', 'content': 0.011475841514766216, 'timestamp': '2025-09-15 03:19:20.529188', 'step': 475, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.558975', 'step': 475, 'epoch': 1} {'type': 'loss', 'content': 0.012057404033839703, 'timestamp': '2025-09-15 03:19:20.582317', 'step': 476, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.611838', 'step': 476, 'epoch': 1} {'type': 'loss', 'content': 0.013628379441797733, 'timestamp': '2025-09-15 03:19:20.614030', 'step': 477, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.644198', 'step': 477, 'epoch': 1} {'type': 'loss', 'content': 0.013962375931441784, 'timestamp': '2025-09-15 03:19:20.646347', 'step': 478, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.676826', 'step': 478, 'epoch': 1} {'type': 'loss', 'content': 0.014084680937230587, 'timestamp': '2025-09-15 03:19:20.678829', 'step': 479, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.708906', 'step': 479, 'epoch': 1} {'type': 'loss', 'content': 0.007172218058258295, 'timestamp': '2025-09-15 03:19:20.732339', 'step': 480, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.761832', 'step': 480, 'epoch': 1} {'type': 'loss', 'content': 0.02609698474407196, 'timestamp': '2025-09-15 03:19:20.763996', 'step': 481, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.793904', 'step': 481, 'epoch': 1} {'type': 'loss', 'content': 0.027867257595062256, 'timestamp': '2025-09-15 03:19:20.795800', 'step': 482, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:20.826089', 'step': 482, 'epoch': 1} {'type': 'loss', 'content': 0.012075444683432579, 'timestamp': '2025-09-15 03:19:20.828206', 'step': 483, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.858430', 'step': 483, 'epoch': 1} {'type': 'loss', 'content': 0.01756882481276989, 'timestamp': '2025-09-15 03:19:20.883340', 'step': 484, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:20.913996', 'step': 484, 'epoch': 1} {'type': 'loss', 'content': 0.01686927303671837, 'timestamp': '2025-09-15 03:19:20.916170', 'step': 485, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:20.945871', 'step': 485, 'epoch': 1} {'type': 'loss', 'content': 0.013569544069468975, 'timestamp': '2025-09-15 03:19:20.948173', 'step': 486, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:20.978420', 'step': 486, 'epoch': 1} {'type': 'loss', 'content': 0.027030980214476585, 'timestamp': '2025-09-15 03:19:20.980683', 'step': 487, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.011351', 'step': 487, 'epoch': 1} {'type': 'loss', 'content': 0.021866770461201668, 'timestamp': '2025-09-15 03:19:21.034950', 'step': 488, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.064837', 'step': 488, 'epoch': 1} {'type': 'loss', 'content': 0.015595144592225552, 'timestamp': '2025-09-15 03:19:21.066964', 'step': 489, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:21.096985', 'step': 489, 'epoch': 1} {'type': 'loss', 'content': 0.011602642014622688, 'timestamp': '2025-09-15 03:19:21.099366', 'step': 490, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.128752', 'step': 490, 'epoch': 1} {'type': 'loss', 'content': 0.01822063885629177, 'timestamp': '2025-09-15 03:19:21.130765', 'step': 491, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:21.160810', 'step': 491, 'epoch': 1} {'type': 'loss', 'content': 0.008895600214600563, 'timestamp': '2025-09-15 03:19:21.184264', 'step': 492, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.214166', 'step': 492, 'epoch': 1} {'type': 'loss', 'content': 0.008032863028347492, 'timestamp': '2025-09-15 03:19:21.216142', 'step': 493, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.245758', 'step': 493, 'epoch': 1} {'type': 'loss', 'content': 0.03705013543367386, 'timestamp': '2025-09-15 03:19:21.247899', 'step': 494, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.277937', 'step': 494, 'epoch': 1} {'type': 'loss', 'content': 0.012541859410703182, 'timestamp': '2025-09-15 03:19:21.280004', 'step': 495, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.312454', 'step': 495, 'epoch': 1} {'type': 'loss', 'content': 0.01749964989721775, 'timestamp': '2025-09-15 03:19:21.336261', 'step': 496, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:21.366956', 'step': 496, 'epoch': 1} {'type': 'loss', 'content': 0.00825695414096117, 'timestamp': '2025-09-15 03:19:21.369467', 'step': 497, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:21.399465', 'step': 497, 'epoch': 1} {'type': 'loss', 'content': 0.0126886498183012, 'timestamp': '2025-09-15 03:19:21.401727', 'step': 498, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.432115', 'step': 498, 'epoch': 1} {'type': 'loss', 'content': 0.01961720548570156, 'timestamp': '2025-09-15 03:19:21.434180', 'step': 499, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:21.463839', 'step': 499, 'epoch': 1} {'type': 'loss', 'content': 0.006928745657205582, 'timestamp': '2025-09-15 03:19:21.487376', 'step': 500, 'epoch': 1} {'type': 'info', 'content': 'Checkpoint saved at step 500', 'timestamp': '2025-09-15 03:19:27.943224', 'step': 500, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:27.980027', 'step': 500, 'epoch': 1} {'type': 'loss', 'content': 0.00608115503564477, 'timestamp': '2025-09-15 03:19:27.982317', 'step': 501, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:28.013857', 'step': 501, 'epoch': 1} {'type': 'loss', 'content': 0.018389714881777763, 'timestamp': '2025-09-15 03:19:28.016328', 'step': 502, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.046536', 'step': 502, 'epoch': 1} {'type': 'loss', 'content': 0.014615455642342567, 'timestamp': '2025-09-15 03:19:28.048877', 'step': 503, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.079057', 'step': 503, 'epoch': 1} {'type': 'loss', 'content': 0.018191883340477943, 'timestamp': '2025-09-15 03:19:28.102793', 'step': 504, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.132932', 'step': 504, 'epoch': 1} {'type': 'loss', 'content': 0.00259787798859179, 'timestamp': '2025-09-15 03:19:28.135309', 'step': 505, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.165002', 'step': 505, 'epoch': 1} {'type': 'loss', 'content': 0.050298064947128296, 'timestamp': '2025-09-15 03:19:28.167124', 'step': 506, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:28.197378', 'step': 506, 'epoch': 1} {'type': 'loss', 'content': 0.028524816036224365, 'timestamp': '2025-09-15 03:19:28.199756', 'step': 507, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:28.230000', 'step': 507, 'epoch': 1} {'type': 'loss', 'content': 0.03299025818705559, 'timestamp': '2025-09-15 03:19:28.253871', 'step': 508, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.284729', 'step': 508, 'epoch': 1} {'type': 'loss', 'content': 0.005647748708724976, 'timestamp': '2025-09-15 03:19:28.287110', 'step': 509, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.317178', 'step': 509, 'epoch': 1} {'type': 'loss', 'content': 0.01253785751760006, 'timestamp': '2025-09-15 03:19:28.319089', 'step': 510, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.349076', 'step': 510, 'epoch': 1} {'type': 'loss', 'content': 0.03369123488664627, 'timestamp': '2025-09-15 03:19:28.351251', 'step': 511, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:28.382327', 'step': 511, 'epoch': 1} {'type': 'loss', 'content': 0.003224168438464403, 'timestamp': '2025-09-15 03:19:28.406167', 'step': 512, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:28.436996', 'step': 512, 'epoch': 1} {'type': 'loss', 'content': 0.02117718942463398, 'timestamp': '2025-09-15 03:19:28.439068', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:29.155360', 'step': 513, 'epoch': 1} {'type': 'pplx', 'content': 103196321.15215471, 'timestamp': '2025-09-15 03:19:29.157619', 'step': 513, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.186536', 'step': 513, 'epoch': 1} {'type': 'loss', 'content': 0.005289103835821152, 'timestamp': '2025-09-15 03:19:29.188811', 'step': 514, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.219031', 'step': 514, 'epoch': 1} {'type': 'loss', 'content': 0.019699811935424805, 'timestamp': '2025-09-15 03:19:29.222466', 'step': 515, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.253330', 'step': 515, 'epoch': 1} {'type': 'loss', 'content': 0.025011127814650536, 'timestamp': '2025-09-15 03:19:29.276960', 'step': 516, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:29.306759', 'step': 516, 'epoch': 1} {'type': 'loss', 'content': 0.032631583511829376, 'timestamp': '2025-09-15 03:19:29.308971', 'step': 517, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.338686', 'step': 517, 'epoch': 1} {'type': 'loss', 'content': 0.027445856481790543, 'timestamp': '2025-09-15 03:19:29.340803', 'step': 518, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.370764', 'step': 518, 'epoch': 1} {'type': 'loss', 'content': 0.01845047064125538, 'timestamp': '2025-09-15 03:19:29.372918', 'step': 519, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.403375', 'step': 519, 'epoch': 1} {'type': 'loss', 'content': 0.00743445660918951, 'timestamp': '2025-09-15 03:19:29.427047', 'step': 520, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.456855', 'step': 520, 'epoch': 1} {'type': 'loss', 'content': 0.02367217466235161, 'timestamp': '2025-09-15 03:19:29.459092', 'step': 521, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.488834', 'step': 521, 'epoch': 1} {'type': 'loss', 'content': 0.022589536383748055, 'timestamp': '2025-09-15 03:19:29.491088', 'step': 522, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.521721', 'step': 522, 'epoch': 1} {'type': 'loss', 'content': 0.007371582556515932, 'timestamp': '2025-09-15 03:19:29.523929', 'step': 523, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.554220', 'step': 523, 'epoch': 1} {'type': 'loss', 'content': 0.0034837510902434587, 'timestamp': '2025-09-15 03:19:29.578015', 'step': 524, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.608232', 'step': 524, 'epoch': 1} {'type': 'loss', 'content': 0.012912024743855, 'timestamp': '2025-09-15 03:19:29.610343', 'step': 525, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.641765', 'step': 525, 'epoch': 1} {'type': 'loss', 'content': 0.02050393633544445, 'timestamp': '2025-09-15 03:19:29.643979', 'step': 526, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.673935', 'step': 526, 'epoch': 1} {'type': 'loss', 'content': 0.009708769619464874, 'timestamp': '2025-09-15 03:19:29.676201', 'step': 527, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.705996', 'step': 527, 'epoch': 1} {'type': 'loss', 'content': 0.014362855814397335, 'timestamp': '2025-09-15 03:19:29.729561', 'step': 528, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.759730', 'step': 528, 'epoch': 1} {'type': 'loss', 'content': 0.009035781025886536, 'timestamp': '2025-09-15 03:19:29.762308', 'step': 529, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.792112', 'step': 529, 'epoch': 1} {'type': 'loss', 'content': 0.007510147523134947, 'timestamp': '2025-09-15 03:19:29.794341', 'step': 530, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.824151', 'step': 530, 'epoch': 1} {'type': 'loss', 'content': 0.0246110912412405, 'timestamp': '2025-09-15 03:19:29.826313', 'step': 531, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.856838', 'step': 531, 'epoch': 1} {'type': 'loss', 'content': 0.015608777292072773, 'timestamp': '2025-09-15 03:19:29.880601', 'step': 532, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:29.910643', 'step': 532, 'epoch': 1} {'type': 'loss', 'content': 0.01901078037917614, 'timestamp': '2025-09-15 03:19:29.912820', 'step': 533, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.942909', 'step': 533, 'epoch': 1} {'type': 'loss', 'content': 0.036900751292705536, 'timestamp': '2025-09-15 03:19:29.945423', 'step': 534, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:29.975951', 'step': 534, 'epoch': 1} {'type': 'loss', 'content': 0.02328217215836048, 'timestamp': '2025-09-15 03:19:29.978717', 'step': 535, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.009678', 'step': 535, 'epoch': 1} {'type': 'loss', 'content': 0.00571950851008296, 'timestamp': '2025-09-15 03:19:30.033333', 'step': 536, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.064355', 'step': 536, 'epoch': 1} {'type': 'loss', 'content': 0.036896418780088425, 'timestamp': '2025-09-15 03:19:30.066492', 'step': 537, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.096238', 'step': 537, 'epoch': 1} {'type': 'loss', 'content': 0.015397454611957073, 'timestamp': '2025-09-15 03:19:30.098994', 'step': 538, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.129739', 'step': 538, 'epoch': 1} {'type': 'loss', 'content': 0.057132985442876816, 'timestamp': '2025-09-15 03:19:30.133129', 'step': 539, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.163818', 'step': 539, 'epoch': 1} {'type': 'loss', 'content': 0.017368067055940628, 'timestamp': '2025-09-15 03:19:30.188068', 'step': 540, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.218982', 'step': 540, 'epoch': 1} {'type': 'loss', 'content': 0.033790573477745056, 'timestamp': '2025-09-15 03:19:30.221931', 'step': 541, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.252705', 'step': 541, 'epoch': 1} {'type': 'loss', 'content': 0.014461012557148933, 'timestamp': '2025-09-15 03:19:30.255093', 'step': 542, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.285723', 'step': 542, 'epoch': 1} {'type': 'loss', 'content': 0.005903469864279032, 'timestamp': '2025-09-15 03:19:30.288147', 'step': 543, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.318215', 'step': 543, 'epoch': 1} {'type': 'loss', 'content': 0.0222102589905262, 'timestamp': '2025-09-15 03:19:30.342045', 'step': 544, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.372445', 'step': 544, 'epoch': 1} {'type': 'loss', 'content': 0.030389001592993736, 'timestamp': '2025-09-15 03:19:30.374463', 'step': 545, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.404540', 'step': 545, 'epoch': 1} {'type': 'loss', 'content': 0.030530178919434547, 'timestamp': '2025-09-15 03:19:30.406642', 'step': 546, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.436463', 'step': 546, 'epoch': 1} {'type': 'loss', 'content': 0.043474406003952026, 'timestamp': '2025-09-15 03:19:30.438647', 'step': 547, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:30.469399', 'step': 547, 'epoch': 1} {'type': 'loss', 'content': 0.025674572214484215, 'timestamp': '2025-09-15 03:19:30.493123', 'step': 548, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.523887', 'step': 548, 'epoch': 1} {'type': 'loss', 'content': 0.039770692586898804, 'timestamp': '2025-09-15 03:19:30.526123', 'step': 549, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.555752', 'step': 549, 'epoch': 1} {'type': 'loss', 'content': 0.028351102024316788, 'timestamp': '2025-09-15 03:19:30.558088', 'step': 550, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.587947', 'step': 550, 'epoch': 1} {'type': 'loss', 'content': 0.024992715567350388, 'timestamp': '2025-09-15 03:19:30.590513', 'step': 551, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.620647', 'step': 551, 'epoch': 1} {'type': 'loss', 'content': 0.0335613451898098, 'timestamp': '2025-09-15 03:19:30.644598', 'step': 552, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.674895', 'step': 552, 'epoch': 1} {'type': 'loss', 'content': 0.03769141435623169, 'timestamp': '2025-09-15 03:19:30.677221', 'step': 553, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.707719', 'step': 553, 'epoch': 1} {'type': 'loss', 'content': 0.0256858728826046, 'timestamp': '2025-09-15 03:19:30.710855', 'step': 554, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.743064', 'step': 554, 'epoch': 1} {'type': 'loss', 'content': 0.008078259415924549, 'timestamp': '2025-09-15 03:19:30.745361', 'step': 555, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.775339', 'step': 555, 'epoch': 1} {'type': 'loss', 'content': 0.033207204192876816, 'timestamp': '2025-09-15 03:19:30.798821', 'step': 556, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:30.828644', 'step': 556, 'epoch': 1} {'type': 'loss', 'content': 0.027974357828497887, 'timestamp': '2025-09-15 03:19:30.831194', 'step': 557, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.861194', 'step': 557, 'epoch': 1} {'type': 'loss', 'content': 0.017816243693232536, 'timestamp': '2025-09-15 03:19:30.868426', 'step': 558, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.898405', 'step': 558, 'epoch': 1} {'type': 'loss', 'content': 0.026075145229697227, 'timestamp': '2025-09-15 03:19:30.900764', 'step': 559, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:30.943455', 'step': 559, 'epoch': 1} {'type': 'loss', 'content': 0.010549420490860939, 'timestamp': '2025-09-15 03:19:30.967281', 'step': 560, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:30.997163', 'step': 560, 'epoch': 1} {'type': 'loss', 'content': 0.01911732740700245, 'timestamp': '2025-09-15 03:19:30.999762', 'step': 561, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.029528', 'step': 561, 'epoch': 1} {'type': 'loss', 'content': 0.017765216529369354, 'timestamp': '2025-09-15 03:19:31.031797', 'step': 562, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.061228', 'step': 562, 'epoch': 1} {'type': 'loss', 'content': 0.018333345651626587, 'timestamp': '2025-09-15 03:19:31.063508', 'step': 563, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.093491', 'step': 563, 'epoch': 1} {'type': 'loss', 'content': 0.026418427005410194, 'timestamp': '2025-09-15 03:19:31.117230', 'step': 564, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:31.147881', 'step': 564, 'epoch': 1} {'type': 'loss', 'content': 0.02311128005385399, 'timestamp': '2025-09-15 03:19:31.150294', 'step': 565, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.180101', 'step': 565, 'epoch': 1} {'type': 'loss', 'content': 0.018931102007627487, 'timestamp': '2025-09-15 03:19:31.182476', 'step': 566, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.212574', 'step': 566, 'epoch': 1} {'type': 'loss', 'content': 0.017706727609038353, 'timestamp': '2025-09-15 03:19:31.214942', 'step': 567, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:31.245282', 'step': 567, 'epoch': 1} {'type': 'loss', 'content': 0.022706808522343636, 'timestamp': '2025-09-15 03:19:31.270109', 'step': 568, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.302088', 'step': 568, 'epoch': 1} {'type': 'loss', 'content': 0.03452968969941139, 'timestamp': '2025-09-15 03:19:31.304874', 'step': 569, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:31.334973', 'step': 569, 'epoch': 1} {'type': 'loss', 'content': 0.01722005195915699, 'timestamp': '2025-09-15 03:19:31.337600', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:32.064332', 'step': 570, 'epoch': 1} {'type': 'pplx', 'content': 121926762.5949161, 'timestamp': '2025-09-15 03:19:32.066255', 'step': 570, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.095386', 'step': 570, 'epoch': 1} {'type': 'loss', 'content': 0.015566927380859852, 'timestamp': '2025-09-15 03:19:32.097639', 'step': 571, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.127943', 'step': 571, 'epoch': 1} {'type': 'loss', 'content': 0.01759842224419117, 'timestamp': '2025-09-15 03:19:32.151554', 'step': 572, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:32.181411', 'step': 572, 'epoch': 1} {'type': 'loss', 'content': 0.0242477860301733, 'timestamp': '2025-09-15 03:19:32.183399', 'step': 573, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.214246', 'step': 573, 'epoch': 1} {'type': 'loss', 'content': 0.01448657363653183, 'timestamp': '2025-09-15 03:19:32.216438', 'step': 574, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.246880', 'step': 574, 'epoch': 1} {'type': 'loss', 'content': 0.02040638029575348, 'timestamp': '2025-09-15 03:19:32.248954', 'step': 575, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.279047', 'step': 575, 'epoch': 1} {'type': 'loss', 'content': 0.016124863177537918, 'timestamp': '2025-09-15 03:19:32.302415', 'step': 576, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:32.332806', 'step': 576, 'epoch': 1} {'type': 'loss', 'content': 0.021785302087664604, 'timestamp': '2025-09-15 03:19:32.334799', 'step': 577, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:32.372166', 'step': 577, 'epoch': 1} {'type': 'loss', 'content': 0.0231538824737072, 'timestamp': '2025-09-15 03:19:32.374930', 'step': 578, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.405741', 'step': 578, 'epoch': 1} {'type': 'loss', 'content': 0.01654941961169243, 'timestamp': '2025-09-15 03:19:32.409225', 'step': 579, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.440079', 'step': 579, 'epoch': 1} {'type': 'loss', 'content': 0.013618906028568745, 'timestamp': '2025-09-15 03:19:32.463532', 'step': 580, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:32.494012', 'step': 580, 'epoch': 1} {'type': 'loss', 'content': 0.016907773911952972, 'timestamp': '2025-09-15 03:19:32.496305', 'step': 581, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.526226', 'step': 581, 'epoch': 1} {'type': 'loss', 'content': 0.012379102408885956, 'timestamp': '2025-09-15 03:19:32.529435', 'step': 582, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:32.561344', 'step': 582, 'epoch': 1} {'type': 'loss', 'content': 0.022719666361808777, 'timestamp': '2025-09-15 03:19:32.563677', 'step': 583, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:32.593679', 'step': 583, 'epoch': 1} {'type': 'loss', 'content': 0.01793176494538784, 'timestamp': '2025-09-15 03:19:32.617414', 'step': 584, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.648356', 'step': 584, 'epoch': 1} {'type': 'loss', 'content': 0.016040056943893433, 'timestamp': '2025-09-15 03:19:32.650958', 'step': 585, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.681070', 'step': 585, 'epoch': 1} {'type': 'loss', 'content': 0.01226879097521305, 'timestamp': '2025-09-15 03:19:32.683566', 'step': 586, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.713784', 'step': 586, 'epoch': 1} {'type': 'loss', 'content': 0.0086965998634696, 'timestamp': '2025-09-15 03:19:32.716034', 'step': 587, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:32.745741', 'step': 587, 'epoch': 1} {'type': 'loss', 'content': 0.026391183957457542, 'timestamp': '2025-09-15 03:19:32.769225', 'step': 588, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.798696', 'step': 588, 'epoch': 1} {'type': 'loss', 'content': 0.024788683280348778, 'timestamp': '2025-09-15 03:19:32.800849', 'step': 589, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.830603', 'step': 589, 'epoch': 1} {'type': 'loss', 'content': 0.02367064729332924, 'timestamp': '2025-09-15 03:19:32.832586', 'step': 590, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:19:32.880943', 'step': 590, 'epoch': 1} {'type': 'loss', 'content': 0.02348589338362217, 'timestamp': '2025-09-15 03:19:32.883116', 'step': 591, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.913202', 'step': 591, 'epoch': 1} {'type': 'loss', 'content': 0.017809707671403885, 'timestamp': '2025-09-15 03:19:32.937348', 'step': 592, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:32.967478', 'step': 592, 'epoch': 1} {'type': 'loss', 'content': 0.02226695977151394, 'timestamp': '2025-09-15 03:19:32.969704', 'step': 593, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:33.000156', 'step': 593, 'epoch': 1} {'type': 'loss', 'content': 0.01061128918081522, 'timestamp': '2025-09-15 03:19:33.002061', 'step': 594, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.031555', 'step': 594, 'epoch': 1} {'type': 'loss', 'content': 0.018722638487815857, 'timestamp': '2025-09-15 03:19:33.033744', 'step': 595, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.065088', 'step': 595, 'epoch': 1} {'type': 'loss', 'content': 0.0316433385014534, 'timestamp': '2025-09-15 03:19:33.088691', 'step': 596, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:33.119482', 'step': 596, 'epoch': 1} {'type': 'loss', 'content': 0.020807506516575813, 'timestamp': '2025-09-15 03:19:33.121812', 'step': 597, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.151699', 'step': 597, 'epoch': 1} {'type': 'loss', 'content': 0.016805419698357582, 'timestamp': '2025-09-15 03:19:33.153659', 'step': 598, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:33.184121', 'step': 598, 'epoch': 1} {'type': 'loss', 'content': 0.03364928811788559, 'timestamp': '2025-09-15 03:19:33.186308', 'step': 599, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.216288', 'step': 599, 'epoch': 1} {'type': 'loss', 'content': 0.02124728076159954, 'timestamp': '2025-09-15 03:19:33.240750', 'step': 600, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.270946', 'step': 600, 'epoch': 1} {'type': 'loss', 'content': 0.02416917122900486, 'timestamp': '2025-09-15 03:19:33.273304', 'step': 601, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.303700', 'step': 601, 'epoch': 1} {'type': 'loss', 'content': 0.022851431742310524, 'timestamp': '2025-09-15 03:19:33.305762', 'step': 602, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.336610', 'step': 602, 'epoch': 1} {'type': 'loss', 'content': 0.014495198614895344, 'timestamp': '2025-09-15 03:19:33.338762', 'step': 603, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.371483', 'step': 603, 'epoch': 1} {'type': 'loss', 'content': 0.02507566474378109, 'timestamp': '2025-09-15 03:19:33.394809', 'step': 604, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.425072', 'step': 604, 'epoch': 1} {'type': 'loss', 'content': 0.019076064229011536, 'timestamp': '2025-09-15 03:19:33.427298', 'step': 605, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:33.457171', 'step': 605, 'epoch': 1} {'type': 'loss', 'content': 0.018669435754418373, 'timestamp': '2025-09-15 03:19:33.459227', 'step': 606, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.489602', 'step': 606, 'epoch': 1} {'type': 'loss', 'content': 0.018617525696754456, 'timestamp': '2025-09-15 03:19:33.491896', 'step': 607, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:33.523534', 'step': 607, 'epoch': 1} {'type': 'loss', 'content': 0.015499304048717022, 'timestamp': '2025-09-15 03:19:33.547160', 'step': 608, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.577007', 'step': 608, 'epoch': 1} {'type': 'loss', 'content': 0.019901065155863762, 'timestamp': '2025-09-15 03:19:33.579301', 'step': 609, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:33.609797', 'step': 609, 'epoch': 1} {'type': 'loss', 'content': 0.019307816401124, 'timestamp': '2025-09-15 03:19:33.612044', 'step': 610, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.642385', 'step': 610, 'epoch': 1} {'type': 'loss', 'content': 0.02826797403395176, 'timestamp': '2025-09-15 03:19:33.644622', 'step': 611, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.674634', 'step': 611, 'epoch': 1} {'type': 'loss', 'content': 0.03018580749630928, 'timestamp': '2025-09-15 03:19:33.698326', 'step': 612, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.728321', 'step': 612, 'epoch': 1} {'type': 'loss', 'content': 0.023904601112008095, 'timestamp': '2025-09-15 03:19:33.730297', 'step': 613, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.760108', 'step': 613, 'epoch': 1} {'type': 'loss', 'content': 0.014215878210961819, 'timestamp': '2025-09-15 03:19:33.762305', 'step': 614, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.792435', 'step': 614, 'epoch': 1} {'type': 'loss', 'content': 0.031668949872255325, 'timestamp': '2025-09-15 03:19:33.794748', 'step': 615, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.825077', 'step': 615, 'epoch': 1} {'type': 'loss', 'content': 0.020902784541249275, 'timestamp': '2025-09-15 03:19:33.848960', 'step': 616, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:33.878880', 'step': 616, 'epoch': 1} {'type': 'loss', 'content': 0.032585740089416504, 'timestamp': '2025-09-15 03:19:33.881426', 'step': 617, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.911931', 'step': 617, 'epoch': 1} {'type': 'loss', 'content': 0.016043754294514656, 'timestamp': '2025-09-15 03:19:33.913997', 'step': 618, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.944384', 'step': 618, 'epoch': 1} {'type': 'loss', 'content': 0.024471482262015343, 'timestamp': '2025-09-15 03:19:33.946630', 'step': 619, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:33.979520', 'step': 619, 'epoch': 1} {'type': 'loss', 'content': 0.016859030351042747, 'timestamp': '2025-09-15 03:19:34.003549', 'step': 620, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:34.033537', 'step': 620, 'epoch': 1} {'type': 'loss', 'content': 0.016345178708434105, 'timestamp': '2025-09-15 03:19:34.035506', 'step': 621, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:34.065306', 'step': 621, 'epoch': 1} {'type': 'loss', 'content': 0.015494337305426598, 'timestamp': '2025-09-15 03:19:34.067448', 'step': 622, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:34.097216', 'step': 622, 'epoch': 1} {'type': 'loss', 'content': 0.016618309542536736, 'timestamp': '2025-09-15 03:19:34.099186', 'step': 623, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:34.128811', 'step': 623, 'epoch': 1} {'type': 'loss', 'content': 0.03520876169204712, 'timestamp': '2025-09-15 03:19:34.152306', 'step': 624, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:34.184480', 'step': 624, 'epoch': 1} {'type': 'loss', 'content': 0.017803248018026352, 'timestamp': '2025-09-15 03:19:34.186904', 'step': 625, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:34.216925', 'step': 625, 'epoch': 1} {'type': 'loss', 'content': 0.011919400654733181, 'timestamp': '2025-09-15 03:19:34.218932', 'step': 626, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:34.249088', 'step': 626, 'epoch': 1} {'type': 'loss', 'content': 0.006866747047752142, 'timestamp': '2025-09-15 03:19:34.251058', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:34.966517', 'step': 627, 'epoch': 1} {'type': 'pplx', 'content': 113143370.79268838, 'timestamp': '2025-09-15 03:19:34.968536', 'step': 627, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:34.997423', 'step': 627, 'epoch': 1} {'type': 'loss', 'content': 0.009564869105815887, 'timestamp': '2025-09-15 03:19:35.021678', 'step': 628, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.052415', 'step': 628, 'epoch': 1} {'type': 'loss', 'content': 0.0398293137550354, 'timestamp': '2025-09-15 03:19:35.054331', 'step': 629, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.088216', 'step': 629, 'epoch': 1} {'type': 'loss', 'content': 0.030554279685020447, 'timestamp': '2025-09-15 03:19:35.090620', 'step': 630, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.121531', 'step': 630, 'epoch': 1} {'type': 'loss', 'content': 0.01749667339026928, 'timestamp': '2025-09-15 03:19:35.123672', 'step': 631, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.154520', 'step': 631, 'epoch': 1} {'type': 'loss', 'content': 0.015980729833245277, 'timestamp': '2025-09-15 03:19:35.180987', 'step': 632, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.219404', 'step': 632, 'epoch': 1} {'type': 'loss', 'content': 0.005738109350204468, 'timestamp': '2025-09-15 03:19:35.222413', 'step': 633, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.252946', 'step': 633, 'epoch': 1} {'type': 'loss', 'content': 0.009976466186344624, 'timestamp': '2025-09-15 03:19:35.254880', 'step': 634, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.285492', 'step': 634, 'epoch': 1} {'type': 'loss', 'content': 0.02471822127699852, 'timestamp': '2025-09-15 03:19:35.287618', 'step': 635, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.317306', 'step': 635, 'epoch': 1} {'type': 'loss', 'content': 0.008112783543765545, 'timestamp': '2025-09-15 03:19:35.341896', 'step': 636, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.372410', 'step': 636, 'epoch': 1} {'type': 'loss', 'content': 0.0037000139709562063, 'timestamp': '2025-09-15 03:19:35.374188', 'step': 637, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.404221', 'step': 637, 'epoch': 1} {'type': 'loss', 'content': 0.032425593584775925, 'timestamp': '2025-09-15 03:19:35.406151', 'step': 638, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.436237', 'step': 638, 'epoch': 1} {'type': 'loss', 'content': 0.032084185630083084, 'timestamp': '2025-09-15 03:19:35.440432', 'step': 639, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.469637', 'step': 639, 'epoch': 1} {'type': 'loss', 'content': 0.003243770683184266, 'timestamp': '2025-09-15 03:19:35.499485', 'step': 640, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.529955', 'step': 640, 'epoch': 1} {'type': 'loss', 'content': 0.009236874990165234, 'timestamp': '2025-09-15 03:19:35.532339', 'step': 641, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.562536', 'step': 641, 'epoch': 1} {'type': 'loss', 'content': 0.016537828370928764, 'timestamp': '2025-09-15 03:19:35.564759', 'step': 642, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.599429', 'step': 642, 'epoch': 1} {'type': 'loss', 'content': 0.007999381981790066, 'timestamp': '2025-09-15 03:19:35.601425', 'step': 643, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.631660', 'step': 643, 'epoch': 1} {'type': 'loss', 'content': 0.03624844551086426, 'timestamp': '2025-09-15 03:19:35.655749', 'step': 644, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.687889', 'step': 644, 'epoch': 1} {'type': 'loss', 'content': 0.0090530039742589, 'timestamp': '2025-09-15 03:19:35.689793', 'step': 645, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.719723', 'step': 645, 'epoch': 1} {'type': 'loss', 'content': 0.03531469404697418, 'timestamp': '2025-09-15 03:19:35.721763', 'step': 646, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.752701', 'step': 646, 'epoch': 1} {'type': 'loss', 'content': 0.007365099154412746, 'timestamp': '2025-09-15 03:19:35.755023', 'step': 647, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:35.785323', 'step': 647, 'epoch': 1} {'type': 'loss', 'content': 0.006562412716448307, 'timestamp': '2025-09-15 03:19:35.808615', 'step': 648, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:35.838453', 'step': 648, 'epoch': 1} {'type': 'loss', 'content': 0.0090226074680686, 'timestamp': '2025-09-15 03:19:35.840473', 'step': 649, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:35.870557', 'step': 649, 'epoch': 1} {'type': 'loss', 'content': 0.006006971467286348, 'timestamp': '2025-09-15 03:19:35.872732', 'step': 650, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.902827', 'step': 650, 'epoch': 1} {'type': 'loss', 'content': 0.017953816801309586, 'timestamp': '2025-09-15 03:19:35.904940', 'step': 651, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.934800', 'step': 651, 'epoch': 1} {'type': 'loss', 'content': 0.014853633008897305, 'timestamp': '2025-09-15 03:19:35.958290', 'step': 652, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:35.988599', 'step': 652, 'epoch': 1} {'type': 'loss', 'content': 0.01929437555372715, 'timestamp': '2025-09-15 03:19:35.990404', 'step': 653, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.020135', 'step': 653, 'epoch': 1} {'type': 'loss', 'content': 0.027933437377214432, 'timestamp': '2025-09-15 03:19:36.022258', 'step': 654, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:36.053501', 'step': 654, 'epoch': 1} {'type': 'loss', 'content': 0.007303243037313223, 'timestamp': '2025-09-15 03:19:36.056158', 'step': 655, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.086539', 'step': 655, 'epoch': 1} {'type': 'loss', 'content': 0.005802966188639402, 'timestamp': '2025-09-15 03:19:36.109977', 'step': 656, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.139863', 'step': 656, 'epoch': 1} {'type': 'loss', 'content': 0.0036463297437876463, 'timestamp': '2025-09-15 03:19:36.142126', 'step': 657, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:36.171715', 'step': 657, 'epoch': 1} {'type': 'loss', 'content': 0.01936980150640011, 'timestamp': '2025-09-15 03:19:36.173881', 'step': 658, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:36.206242', 'step': 658, 'epoch': 1} {'type': 'loss', 'content': 0.005909489002078772, 'timestamp': '2025-09-15 03:19:36.208350', 'step': 659, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.237937', 'step': 659, 'epoch': 1} {'type': 'loss', 'content': 0.011962154880166054, 'timestamp': '2025-09-15 03:19:36.261456', 'step': 660, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.292248', 'step': 660, 'epoch': 1} {'type': 'loss', 'content': 0.010646837763488293, 'timestamp': '2025-09-15 03:19:36.294001', 'step': 661, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.324433', 'step': 661, 'epoch': 1} {'type': 'loss', 'content': 0.004621988628059626, 'timestamp': '2025-09-15 03:19:36.326545', 'step': 662, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.356674', 'step': 662, 'epoch': 1} {'type': 'loss', 'content': 0.01572437211871147, 'timestamp': '2025-09-15 03:19:36.359594', 'step': 663, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:36.389546', 'step': 663, 'epoch': 1} {'type': 'loss', 'content': 0.01924065686762333, 'timestamp': '2025-09-15 03:19:36.413356', 'step': 664, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.443333', 'step': 664, 'epoch': 1} {'type': 'loss', 'content': 0.009026370011270046, 'timestamp': '2025-09-15 03:19:36.445178', 'step': 665, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.475161', 'step': 665, 'epoch': 1} {'type': 'loss', 'content': 0.006555170752108097, 'timestamp': '2025-09-15 03:19:36.477114', 'step': 666, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.508774', 'step': 666, 'epoch': 1} {'type': 'loss', 'content': 0.012694472447037697, 'timestamp': '2025-09-15 03:19:36.510930', 'step': 667, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.541487', 'step': 667, 'epoch': 1} {'type': 'loss', 'content': 0.012330038473010063, 'timestamp': '2025-09-15 03:19:36.565046', 'step': 668, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.595113', 'step': 668, 'epoch': 1} {'type': 'loss', 'content': 0.04121780022978783, 'timestamp': '2025-09-15 03:19:36.597217', 'step': 669, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:36.627506', 'step': 669, 'epoch': 1} {'type': 'loss', 'content': 0.05309586599469185, 'timestamp': '2025-09-15 03:19:36.629586', 'step': 670, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.659708', 'step': 670, 'epoch': 1} {'type': 'loss', 'content': 0.006368253845721483, 'timestamp': '2025-09-15 03:19:36.662648', 'step': 671, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.692272', 'step': 671, 'epoch': 1} {'type': 'loss', 'content': 0.019130660220980644, 'timestamp': '2025-09-15 03:19:36.715694', 'step': 672, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.746500', 'step': 672, 'epoch': 1} {'type': 'loss', 'content': 0.024251120164990425, 'timestamp': '2025-09-15 03:19:36.748928', 'step': 673, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.778732', 'step': 673, 'epoch': 1} {'type': 'loss', 'content': 0.025649379938840866, 'timestamp': '2025-09-15 03:19:36.780936', 'step': 674, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.810782', 'step': 674, 'epoch': 1} {'type': 'loss', 'content': 0.04780502989888191, 'timestamp': '2025-09-15 03:19:36.812741', 'step': 675, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.843220', 'step': 675, 'epoch': 1} {'type': 'loss', 'content': 0.012879802845418453, 'timestamp': '2025-09-15 03:19:36.866538', 'step': 676, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.896279', 'step': 676, 'epoch': 1} {'type': 'loss', 'content': 0.0165265966206789, 'timestamp': '2025-09-15 03:19:36.898458', 'step': 677, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:36.928417', 'step': 677, 'epoch': 1} {'type': 'loss', 'content': 0.009652511216700077, 'timestamp': '2025-09-15 03:19:36.930824', 'step': 678, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.961659', 'step': 678, 'epoch': 1} {'type': 'loss', 'content': 0.008517156355082989, 'timestamp': '2025-09-15 03:19:36.963827', 'step': 679, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:36.994853', 'step': 679, 'epoch': 1} {'type': 'loss', 'content': 0.012086872942745686, 'timestamp': '2025-09-15 03:19:37.018261', 'step': 680, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:37.047953', 'step': 680, 'epoch': 1} {'type': 'loss', 'content': 0.01267678290605545, 'timestamp': '2025-09-15 03:19:37.050126', 'step': 681, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:37.080491', 'step': 681, 'epoch': 1} {'type': 'loss', 'content': 0.014379492029547691, 'timestamp': '2025-09-15 03:19:37.082330', 'step': 682, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:37.111970', 'step': 682, 'epoch': 1} {'type': 'loss', 'content': 0.018130462616682053, 'timestamp': '2025-09-15 03:19:37.113977', 'step': 683, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:37.144365', 'step': 683, 'epoch': 1} {'type': 'loss', 'content': 0.009595629759132862, 'timestamp': '2025-09-15 03:19:37.168484', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:37.880121', 'step': 684, 'epoch': 1} {'type': 'pplx', 'content': 110478225.79936634, 'timestamp': '2025-09-15 03:19:37.882397', 'step': 684, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:37.910669', 'step': 684, 'epoch': 1} {'type': 'loss', 'content': 0.03872322663664818, 'timestamp': '2025-09-15 03:19:37.912785', 'step': 685, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:37.942433', 'step': 685, 'epoch': 1} {'type': 'loss', 'content': 0.018750173971056938, 'timestamp': '2025-09-15 03:19:37.944777', 'step': 686, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:37.975193', 'step': 686, 'epoch': 1} {'type': 'loss', 'content': 0.0061281099915504456, 'timestamp': '2025-09-15 03:19:37.978429', 'step': 687, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.008224', 'step': 687, 'epoch': 1} {'type': 'loss', 'content': 0.006822456140071154, 'timestamp': '2025-09-15 03:19:38.031688', 'step': 688, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.061859', 'step': 688, 'epoch': 1} {'type': 'loss', 'content': 0.04175681620836258, 'timestamp': '2025-09-15 03:19:38.063998', 'step': 689, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.095345', 'step': 689, 'epoch': 1} {'type': 'loss', 'content': 0.00807065051048994, 'timestamp': '2025-09-15 03:19:38.097520', 'step': 690, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.129154', 'step': 690, 'epoch': 1} {'type': 'loss', 'content': 0.007711734157055616, 'timestamp': '2025-09-15 03:19:38.131369', 'step': 691, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.160717', 'step': 691, 'epoch': 1} {'type': 'loss', 'content': 0.00910022109746933, 'timestamp': '2025-09-15 03:19:38.184220', 'step': 692, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.214151', 'step': 692, 'epoch': 1} {'type': 'loss', 'content': 0.027539242058992386, 'timestamp': '2025-09-15 03:19:38.216324', 'step': 693, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.246949', 'step': 693, 'epoch': 1} {'type': 'loss', 'content': 0.047927696257829666, 'timestamp': '2025-09-15 03:19:38.249186', 'step': 694, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.278947', 'step': 694, 'epoch': 1} {'type': 'loss', 'content': 0.0026730988174676895, 'timestamp': '2025-09-15 03:19:38.281425', 'step': 695, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:38.311921', 'step': 695, 'epoch': 1} {'type': 'loss', 'content': 0.024083703756332397, 'timestamp': '2025-09-15 03:19:38.335624', 'step': 696, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.375662', 'step': 696, 'epoch': 1} {'type': 'loss', 'content': 0.008579043671488762, 'timestamp': '2025-09-15 03:19:38.378574', 'step': 697, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.409372', 'step': 697, 'epoch': 1} {'type': 'loss', 'content': 0.006062888540327549, 'timestamp': '2025-09-15 03:19:38.412058', 'step': 698, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.442720', 'step': 698, 'epoch': 1} {'type': 'loss', 'content': 0.023601248860359192, 'timestamp': '2025-09-15 03:19:38.445222', 'step': 699, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.476156', 'step': 699, 'epoch': 1} {'type': 'loss', 'content': 0.04036266356706619, 'timestamp': '2025-09-15 03:19:38.499944', 'step': 700, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:38.531441', 'step': 700, 'epoch': 1} {'type': 'loss', 'content': 0.00857632141560316, 'timestamp': '2025-09-15 03:19:38.533626', 'step': 701, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.564681', 'step': 701, 'epoch': 1} {'type': 'loss', 'content': 0.01594799943268299, 'timestamp': '2025-09-15 03:19:38.566998', 'step': 702, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.597747', 'step': 702, 'epoch': 1} {'type': 'loss', 'content': 0.0195697583258152, 'timestamp': '2025-09-15 03:19:38.600011', 'step': 703, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.630500', 'step': 703, 'epoch': 1} {'type': 'loss', 'content': 0.02024012990295887, 'timestamp': '2025-09-15 03:19:38.654044', 'step': 704, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:38.684706', 'step': 704, 'epoch': 1} {'type': 'loss', 'content': 0.022832201793789864, 'timestamp': '2025-09-15 03:19:38.688201', 'step': 705, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:38.718266', 'step': 705, 'epoch': 1} {'type': 'loss', 'content': 0.004162064287811518, 'timestamp': '2025-09-15 03:19:38.720720', 'step': 706, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.751034', 'step': 706, 'epoch': 1} {'type': 'loss', 'content': 0.0038514018524438143, 'timestamp': '2025-09-15 03:19:38.753103', 'step': 707, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.783498', 'step': 707, 'epoch': 1} {'type': 'loss', 'content': 0.01295088417828083, 'timestamp': '2025-09-15 03:19:38.807224', 'step': 708, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.838316', 'step': 708, 'epoch': 1} {'type': 'loss', 'content': 0.012839587405323982, 'timestamp': '2025-09-15 03:19:38.840761', 'step': 709, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.870848', 'step': 709, 'epoch': 1} {'type': 'loss', 'content': 0.04741484671831131, 'timestamp': '2025-09-15 03:19:38.873113', 'step': 710, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:38.903902', 'step': 710, 'epoch': 1} {'type': 'loss', 'content': 0.025619518011808395, 'timestamp': '2025-09-15 03:19:38.906239', 'step': 711, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.937428', 'step': 711, 'epoch': 1} {'type': 'loss', 'content': 0.03996429219841957, 'timestamp': '2025-09-15 03:19:38.961599', 'step': 712, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:38.991884', 'step': 712, 'epoch': 1} {'type': 'loss', 'content': 0.016623103991150856, 'timestamp': '2025-09-15 03:19:38.994023', 'step': 713, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.023874', 'step': 713, 'epoch': 1} {'type': 'loss', 'content': 0.010368691757321358, 'timestamp': '2025-09-15 03:19:39.026689', 'step': 714, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:39.056698', 'step': 714, 'epoch': 1} {'type': 'loss', 'content': 0.022392038255929947, 'timestamp': '2025-09-15 03:19:39.058863', 'step': 715, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.088619', 'step': 715, 'epoch': 1} {'type': 'loss', 'content': 0.03169969096779823, 'timestamp': '2025-09-15 03:19:39.112653', 'step': 716, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.143062', 'step': 716, 'epoch': 1} {'type': 'loss', 'content': 0.01695393957197666, 'timestamp': '2025-09-15 03:19:39.145326', 'step': 717, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.175095', 'step': 717, 'epoch': 1} {'type': 'loss', 'content': 0.020151188597083092, 'timestamp': '2025-09-15 03:19:39.177433', 'step': 718, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.208063', 'step': 718, 'epoch': 1} {'type': 'loss', 'content': 0.02938300557434559, 'timestamp': '2025-09-15 03:19:39.210073', 'step': 719, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:39.241620', 'step': 719, 'epoch': 1} {'type': 'loss', 'content': 0.03406095132231712, 'timestamp': '2025-09-15 03:19:39.265302', 'step': 720, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.295237', 'step': 720, 'epoch': 1} {'type': 'loss', 'content': 0.012514782138168812, 'timestamp': '2025-09-15 03:19:39.297712', 'step': 721, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.327634', 'step': 721, 'epoch': 1} {'type': 'loss', 'content': 0.013856396079063416, 'timestamp': '2025-09-15 03:19:39.329995', 'step': 722, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.360124', 'step': 722, 'epoch': 1} {'type': 'loss', 'content': 0.01550250593572855, 'timestamp': '2025-09-15 03:19:39.362316', 'step': 723, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.392400', 'step': 723, 'epoch': 1} {'type': 'loss', 'content': 0.02845914289355278, 'timestamp': '2025-09-15 03:19:39.416151', 'step': 724, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.446701', 'step': 724, 'epoch': 1} {'type': 'loss', 'content': 0.027011645957827568, 'timestamp': '2025-09-15 03:19:39.449256', 'step': 725, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:39.479549', 'step': 725, 'epoch': 1} {'type': 'loss', 'content': 0.022153319790959358, 'timestamp': '2025-09-15 03:19:39.481807', 'step': 726, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:39.512515', 'step': 726, 'epoch': 1} {'type': 'loss', 'content': 0.017665695399045944, 'timestamp': '2025-09-15 03:19:39.514908', 'step': 727, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.546049', 'step': 727, 'epoch': 1} {'type': 'loss', 'content': 0.015788642689585686, 'timestamp': '2025-09-15 03:19:39.569513', 'step': 728, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.599192', 'step': 728, 'epoch': 1} {'type': 'loss', 'content': 0.009126792661845684, 'timestamp': '2025-09-15 03:19:39.601371', 'step': 729, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.631684', 'step': 729, 'epoch': 1} {'type': 'loss', 'content': 0.02464197389781475, 'timestamp': '2025-09-15 03:19:39.633942', 'step': 730, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.663815', 'step': 730, 'epoch': 1} {'type': 'loss', 'content': 0.01288971770554781, 'timestamp': '2025-09-15 03:19:39.665990', 'step': 731, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.696417', 'step': 731, 'epoch': 1} {'type': 'loss', 'content': 0.023708462715148926, 'timestamp': '2025-09-15 03:19:39.720257', 'step': 732, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:39.751029', 'step': 732, 'epoch': 1} {'type': 'loss', 'content': 0.018586965277791023, 'timestamp': '2025-09-15 03:19:39.753307', 'step': 733, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:39.783625', 'step': 733, 'epoch': 1} {'type': 'loss', 'content': 0.015598964877426624, 'timestamp': '2025-09-15 03:19:39.785775', 'step': 734, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:39.816683', 'step': 734, 'epoch': 1} {'type': 'loss', 'content': 0.017110329121351242, 'timestamp': '2025-09-15 03:19:39.819271', 'step': 735, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.849995', 'step': 735, 'epoch': 1} {'type': 'loss', 'content': 0.013492013327777386, 'timestamp': '2025-09-15 03:19:39.873602', 'step': 736, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.903812', 'step': 736, 'epoch': 1} {'type': 'loss', 'content': 0.03329331800341606, 'timestamp': '2025-09-15 03:19:39.905917', 'step': 737, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.935792', 'step': 737, 'epoch': 1} {'type': 'loss', 'content': 0.009817318990826607, 'timestamp': '2025-09-15 03:19:39.938021', 'step': 738, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:39.968219', 'step': 738, 'epoch': 1} {'type': 'loss', 'content': 0.022194471210241318, 'timestamp': '2025-09-15 03:19:39.970551', 'step': 739, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:40.000627', 'step': 739, 'epoch': 1} {'type': 'loss', 'content': 0.031265296041965485, 'timestamp': '2025-09-15 03:19:40.024156', 'step': 740, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:40.054912', 'step': 740, 'epoch': 1} {'type': 'loss', 'content': 0.01729530841112137, 'timestamp': '2025-09-15 03:19:40.057327', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:40.778644', 'step': 741, 'epoch': 1} {'type': 'pplx', 'content': 103717247.03669673, 'timestamp': '2025-09-15 03:19:40.780588', 'step': 741, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:40.808719', 'step': 741, 'epoch': 1} {'type': 'loss', 'content': 0.010400855913758278, 'timestamp': '2025-09-15 03:19:40.810891', 'step': 742, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:40.840610', 'step': 742, 'epoch': 1} {'type': 'loss', 'content': 0.015962064266204834, 'timestamp': '2025-09-15 03:19:40.843064', 'step': 743, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:40.873807', 'step': 743, 'epoch': 1} {'type': 'loss', 'content': 0.03265008702874184, 'timestamp': '2025-09-15 03:19:40.897447', 'step': 744, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:40.927413', 'step': 744, 'epoch': 1} {'type': 'loss', 'content': 0.02071802131831646, 'timestamp': '2025-09-15 03:19:40.929460', 'step': 745, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:40.960245', 'step': 745, 'epoch': 1} {'type': 'loss', 'content': 0.007692706771194935, 'timestamp': '2025-09-15 03:19:40.962279', 'step': 746, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:40.992394', 'step': 746, 'epoch': 1} {'type': 'loss', 'content': 0.007905025035142899, 'timestamp': '2025-09-15 03:19:40.994567', 'step': 747, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.024483', 'step': 747, 'epoch': 1} {'type': 'loss', 'content': 0.02939099445939064, 'timestamp': '2025-09-15 03:19:41.047734', 'step': 748, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.078688', 'step': 748, 'epoch': 1} {'type': 'loss', 'content': 0.01641055755317211, 'timestamp': '2025-09-15 03:19:41.080807', 'step': 749, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:41.111344', 'step': 749, 'epoch': 1} {'type': 'loss', 'content': 0.010218380019068718, 'timestamp': '2025-09-15 03:19:41.113405', 'step': 750, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:41.143853', 'step': 750, 'epoch': 1} {'type': 'loss', 'content': 0.02515035681426525, 'timestamp': '2025-09-15 03:19:41.145967', 'step': 751, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.175812', 'step': 751, 'epoch': 1} {'type': 'loss', 'content': 0.0318562313914299, 'timestamp': '2025-09-15 03:19:41.199260', 'step': 752, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.229048', 'step': 752, 'epoch': 1} {'type': 'loss', 'content': 0.017711641266942024, 'timestamp': '2025-09-15 03:19:41.231205', 'step': 753, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:41.261072', 'step': 753, 'epoch': 1} {'type': 'loss', 'content': 0.01047059241682291, 'timestamp': '2025-09-15 03:19:41.264903', 'step': 754, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:41.295027', 'step': 754, 'epoch': 1} {'type': 'loss', 'content': 0.013568080961704254, 'timestamp': '2025-09-15 03:19:41.297049', 'step': 755, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:41.327048', 'step': 755, 'epoch': 1} {'type': 'loss', 'content': 0.025309359654784203, 'timestamp': '2025-09-15 03:19:41.350361', 'step': 756, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.379873', 'step': 756, 'epoch': 1} {'type': 'loss', 'content': 0.007763172034174204, 'timestamp': '2025-09-15 03:19:41.381794', 'step': 757, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:41.412033', 'step': 757, 'epoch': 1} {'type': 'loss', 'content': 0.007784743793308735, 'timestamp': '2025-09-15 03:19:41.414104', 'step': 758, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:41.443993', 'step': 758, 'epoch': 1} {'type': 'loss', 'content': 0.011267587542533875, 'timestamp': '2025-09-15 03:19:41.446425', 'step': 759, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.476903', 'step': 759, 'epoch': 1} {'type': 'loss', 'content': 0.013863942585885525, 'timestamp': '2025-09-15 03:19:41.500438', 'step': 760, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.531170', 'step': 760, 'epoch': 1} {'type': 'loss', 'content': 0.006926204077899456, 'timestamp': '2025-09-15 03:19:41.533280', 'step': 761, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:19:41.563986', 'step': 761, 'epoch': 1} {'type': 'loss', 'content': 0.01246979646384716, 'timestamp': '2025-09-15 03:19:41.566514', 'step': 762, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.596790', 'step': 762, 'epoch': 1} {'type': 'loss', 'content': 0.029222693294286728, 'timestamp': '2025-09-15 03:19:41.598803', 'step': 763, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.629258', 'step': 763, 'epoch': 1} {'type': 'loss', 'content': 0.008983985520899296, 'timestamp': '2025-09-15 03:19:41.652444', 'step': 764, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.682490', 'step': 764, 'epoch': 1} {'type': 'loss', 'content': 0.010987072251737118, 'timestamp': '2025-09-15 03:19:41.684608', 'step': 765, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.714931', 'step': 765, 'epoch': 1} {'type': 'loss', 'content': 0.013884793035686016, 'timestamp': '2025-09-15 03:19:41.716900', 'step': 766, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.747942', 'step': 766, 'epoch': 1} {'type': 'loss', 'content': 0.007951878942549229, 'timestamp': '2025-09-15 03:19:41.750087', 'step': 767, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.782095', 'step': 767, 'epoch': 1} {'type': 'loss', 'content': 0.034430526196956635, 'timestamp': '2025-09-15 03:19:41.805391', 'step': 768, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.836765', 'step': 768, 'epoch': 1} {'type': 'loss', 'content': 0.01271742768585682, 'timestamp': '2025-09-15 03:19:41.838870', 'step': 769, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.869259', 'step': 769, 'epoch': 1} {'type': 'loss', 'content': 0.03543007746338844, 'timestamp': '2025-09-15 03:19:41.871586', 'step': 770, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:41.901840', 'step': 770, 'epoch': 1} {'type': 'loss', 'content': 0.021944111213088036, 'timestamp': '2025-09-15 03:19:41.903746', 'step': 771, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:41.934061', 'step': 771, 'epoch': 1} {'type': 'loss', 'content': 0.008409752510488033, 'timestamp': '2025-09-15 03:19:41.957375', 'step': 772, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:41.987312', 'step': 772, 'epoch': 1} {'type': 'loss', 'content': 0.04508158937096596, 'timestamp': '2025-09-15 03:19:41.989251', 'step': 773, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.019250', 'step': 773, 'epoch': 1} {'type': 'loss', 'content': 0.006456805858761072, 'timestamp': '2025-09-15 03:19:42.021464', 'step': 774, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:42.051437', 'step': 774, 'epoch': 1} {'type': 'loss', 'content': 0.028556156903505325, 'timestamp': '2025-09-15 03:19:42.053388', 'step': 775, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:42.083163', 'step': 775, 'epoch': 1} {'type': 'loss', 'content': 0.027395443990826607, 'timestamp': '2025-09-15 03:19:42.106623', 'step': 776, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:42.136557', 'step': 776, 'epoch': 1} {'type': 'loss', 'content': 0.006619160529226065, 'timestamp': '2025-09-15 03:19:42.138621', 'step': 777, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.168555', 'step': 777, 'epoch': 1} {'type': 'loss', 'content': 0.003444958943873644, 'timestamp': '2025-09-15 03:19:42.170628', 'step': 778, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:42.200353', 'step': 778, 'epoch': 1} {'type': 'loss', 'content': 0.02690565027296543, 'timestamp': '2025-09-15 03:19:42.202531', 'step': 779, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.232264', 'step': 779, 'epoch': 1} {'type': 'loss', 'content': 0.006759346928447485, 'timestamp': '2025-09-15 03:19:42.255684', 'step': 780, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.285862', 'step': 780, 'epoch': 1} {'type': 'loss', 'content': 0.02217439003288746, 'timestamp': '2025-09-15 03:19:42.287802', 'step': 781, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:42.318224', 'step': 781, 'epoch': 1} {'type': 'loss', 'content': 0.014988348819315434, 'timestamp': '2025-09-15 03:19:42.320526', 'step': 782, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.350429', 'step': 782, 'epoch': 1} {'type': 'loss', 'content': 0.026887197047472, 'timestamp': '2025-09-15 03:19:42.352465', 'step': 783, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:42.383306', 'step': 783, 'epoch': 1} {'type': 'loss', 'content': 0.016281643882393837, 'timestamp': '2025-09-15 03:19:42.406709', 'step': 784, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.437336', 'step': 784, 'epoch': 1} {'type': 'loss', 'content': 0.006518169771879911, 'timestamp': '2025-09-15 03:19:42.439424', 'step': 785, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:42.469707', 'step': 785, 'epoch': 1} {'type': 'loss', 'content': 0.003812138456851244, 'timestamp': '2025-09-15 03:19:42.471716', 'step': 786, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.501641', 'step': 786, 'epoch': 1} {'type': 'loss', 'content': 0.004710691515356302, 'timestamp': '2025-09-15 03:19:42.503901', 'step': 787, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.534023', 'step': 787, 'epoch': 1} {'type': 'loss', 'content': 0.026245180517435074, 'timestamp': '2025-09-15 03:19:42.558608', 'step': 788, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.588519', 'step': 788, 'epoch': 1} {'type': 'loss', 'content': 0.011165949515998363, 'timestamp': '2025-09-15 03:19:42.590814', 'step': 789, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:42.620824', 'step': 789, 'epoch': 1} {'type': 'loss', 'content': 0.003821011632680893, 'timestamp': '2025-09-15 03:19:42.623018', 'step': 790, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.653068', 'step': 790, 'epoch': 1} {'type': 'loss', 'content': 0.017458414658904076, 'timestamp': '2025-09-15 03:19:42.655132', 'step': 791, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.685327', 'step': 791, 'epoch': 1} {'type': 'loss', 'content': 0.02229154482483864, 'timestamp': '2025-09-15 03:19:42.708863', 'step': 792, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.738929', 'step': 792, 'epoch': 1} {'type': 'loss', 'content': 0.009810912422835827, 'timestamp': '2025-09-15 03:19:42.741257', 'step': 793, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.770555', 'step': 793, 'epoch': 1} {'type': 'loss', 'content': 0.04138774797320366, 'timestamp': '2025-09-15 03:19:42.772514', 'step': 794, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.801904', 'step': 794, 'epoch': 1} {'type': 'loss', 'content': 0.014048370532691479, 'timestamp': '2025-09-15 03:19:42.803777', 'step': 795, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.833875', 'step': 795, 'epoch': 1} {'type': 'loss', 'content': 0.0071984389796853065, 'timestamp': '2025-09-15 03:19:42.857493', 'step': 796, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:42.887716', 'step': 796, 'epoch': 1} {'type': 'loss', 'content': 0.008396588265895844, 'timestamp': '2025-09-15 03:19:42.889942', 'step': 797, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:42.920444', 'step': 797, 'epoch': 1} {'type': 'loss', 'content': 0.014851844869554043, 'timestamp': '2025-09-15 03:19:42.922534', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:43.629519', 'step': 798, 'epoch': 1} {'type': 'pplx', 'content': 104045215.20344174, 'timestamp': '2025-09-15 03:19:43.631597', 'step': 798, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.660045', 'step': 798, 'epoch': 1} {'type': 'loss', 'content': 0.005125641357153654, 'timestamp': '2025-09-15 03:19:43.662088', 'step': 799, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.691708', 'step': 799, 'epoch': 1} {'type': 'loss', 'content': 0.03359196335077286, 'timestamp': '2025-09-15 03:19:43.715256', 'step': 800, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.745226', 'step': 800, 'epoch': 1} {'type': 'loss', 'content': 0.024786178022623062, 'timestamp': '2025-09-15 03:19:43.747145', 'step': 801, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:43.777091', 'step': 801, 'epoch': 1} {'type': 'loss', 'content': 0.030102457851171494, 'timestamp': '2025-09-15 03:19:43.779278', 'step': 802, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.808965', 'step': 802, 'epoch': 1} {'type': 'loss', 'content': 0.024250861257314682, 'timestamp': '2025-09-15 03:19:43.811257', 'step': 803, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.841172', 'step': 803, 'epoch': 1} {'type': 'loss', 'content': 0.008365518413484097, 'timestamp': '2025-09-15 03:19:43.864521', 'step': 804, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.894479', 'step': 804, 'epoch': 1} {'type': 'loss', 'content': 0.024482980370521545, 'timestamp': '2025-09-15 03:19:43.896537', 'step': 805, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.926254', 'step': 805, 'epoch': 1} {'type': 'loss', 'content': 0.026492753997445107, 'timestamp': '2025-09-15 03:19:43.928646', 'step': 806, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:43.958387', 'step': 806, 'epoch': 1} {'type': 'loss', 'content': 0.01094807032495737, 'timestamp': '2025-09-15 03:19:43.960359', 'step': 807, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:43.990492', 'step': 807, 'epoch': 1} {'type': 'loss', 'content': 0.01790359988808632, 'timestamp': '2025-09-15 03:19:44.013933', 'step': 808, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:44.044146', 'step': 808, 'epoch': 1} {'type': 'loss', 'content': 0.00928981602191925, 'timestamp': '2025-09-15 03:19:44.046286', 'step': 809, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:44.076052', 'step': 809, 'epoch': 1} {'type': 'loss', 'content': 0.01696016825735569, 'timestamp': '2025-09-15 03:19:44.078318', 'step': 810, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.108250', 'step': 810, 'epoch': 1} {'type': 'loss', 'content': 0.020789949223399162, 'timestamp': '2025-09-15 03:19:44.110252', 'step': 811, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.140116', 'step': 811, 'epoch': 1} {'type': 'loss', 'content': 0.012049756944179535, 'timestamp': '2025-09-15 03:19:44.163467', 'step': 812, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.193188', 'step': 812, 'epoch': 1} {'type': 'loss', 'content': 0.02744273841381073, 'timestamp': '2025-09-15 03:19:44.195207', 'step': 813, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.224712', 'step': 813, 'epoch': 1} {'type': 'loss', 'content': 0.014353885315358639, 'timestamp': '2025-09-15 03:19:44.226742', 'step': 814, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.256987', 'step': 814, 'epoch': 1} {'type': 'loss', 'content': 0.008923429064452648, 'timestamp': '2025-09-15 03:19:44.259241', 'step': 815, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.288686', 'step': 815, 'epoch': 1} {'type': 'loss', 'content': 0.009295600466430187, 'timestamp': '2025-09-15 03:19:44.312556', 'step': 816, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.342475', 'step': 816, 'epoch': 1} {'type': 'loss', 'content': 0.025669749826192856, 'timestamp': '2025-09-15 03:19:44.344563', 'step': 817, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.374499', 'step': 817, 'epoch': 1} {'type': 'loss', 'content': 0.013954401016235352, 'timestamp': '2025-09-15 03:19:44.376539', 'step': 818, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.406901', 'step': 818, 'epoch': 1} {'type': 'loss', 'content': 0.007388160564005375, 'timestamp': '2025-09-15 03:19:44.408833', 'step': 819, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.439325', 'step': 819, 'epoch': 1} {'type': 'loss', 'content': 0.014094533398747444, 'timestamp': '2025-09-15 03:19:44.462711', 'step': 820, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:44.492759', 'step': 820, 'epoch': 1} {'type': 'loss', 'content': 0.012995717115700245, 'timestamp': '2025-09-15 03:19:44.494912', 'step': 821, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.524843', 'step': 821, 'epoch': 1} {'type': 'loss', 'content': 0.01578564941883087, 'timestamp': '2025-09-15 03:19:44.526939', 'step': 822, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.557018', 'step': 822, 'epoch': 1} {'type': 'loss', 'content': 0.012877593748271465, 'timestamp': '2025-09-15 03:19:44.558916', 'step': 823, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:44.590517', 'step': 823, 'epoch': 1} {'type': 'loss', 'content': 0.02812553383409977, 'timestamp': '2025-09-15 03:19:44.614026', 'step': 824, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:44.643995', 'step': 824, 'epoch': 1} {'type': 'loss', 'content': 0.009813624434173107, 'timestamp': '2025-09-15 03:19:44.646241', 'step': 825, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.675963', 'step': 825, 'epoch': 1} {'type': 'loss', 'content': 0.01687939278781414, 'timestamp': '2025-09-15 03:19:44.678068', 'step': 826, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.707979', 'step': 826, 'epoch': 1} {'type': 'loss', 'content': 0.02816028706729412, 'timestamp': '2025-09-15 03:19:44.710193', 'step': 827, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:44.740965', 'step': 827, 'epoch': 1} {'type': 'loss', 'content': 0.011854671873152256, 'timestamp': '2025-09-15 03:19:44.764447', 'step': 828, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.795127', 'step': 828, 'epoch': 1} {'type': 'loss', 'content': 0.014840801246464252, 'timestamp': '2025-09-15 03:19:44.797196', 'step': 829, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.827052', 'step': 829, 'epoch': 1} {'type': 'loss', 'content': 0.004077043384313583, 'timestamp': '2025-09-15 03:19:44.829125', 'step': 830, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.858933', 'step': 830, 'epoch': 1} {'type': 'loss', 'content': 0.005604589823633432, 'timestamp': '2025-09-15 03:19:44.860942', 'step': 831, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.890927', 'step': 831, 'epoch': 1} {'type': 'loss', 'content': 0.004893143195658922, 'timestamp': '2025-09-15 03:19:44.914522', 'step': 832, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:44.944679', 'step': 832, 'epoch': 1} {'type': 'loss', 'content': 0.03003629483282566, 'timestamp': '2025-09-15 03:19:44.946780', 'step': 833, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:44.977071', 'step': 833, 'epoch': 1} {'type': 'loss', 'content': 0.01299560721963644, 'timestamp': '2025-09-15 03:19:44.979326', 'step': 834, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:45.009667', 'step': 834, 'epoch': 1} {'type': 'loss', 'content': 0.0031098301988095045, 'timestamp': '2025-09-15 03:19:45.011653', 'step': 835, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.042133', 'step': 835, 'epoch': 1} {'type': 'loss', 'content': 0.006905579008162022, 'timestamp': '2025-09-15 03:19:45.065417', 'step': 836, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:45.097673', 'step': 836, 'epoch': 1} {'type': 'loss', 'content': 0.040472351014614105, 'timestamp': '2025-09-15 03:19:45.100169', 'step': 837, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.130115', 'step': 837, 'epoch': 1} {'type': 'loss', 'content': 0.02735285460948944, 'timestamp': '2025-09-15 03:19:45.132218', 'step': 838, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.162417', 'step': 838, 'epoch': 1} {'type': 'loss', 'content': 0.03405022248625755, 'timestamp': '2025-09-15 03:19:45.164413', 'step': 839, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:45.193969', 'step': 839, 'epoch': 1} {'type': 'loss', 'content': 0.01648932322859764, 'timestamp': '2025-09-15 03:19:45.217348', 'step': 840, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.247358', 'step': 840, 'epoch': 1} {'type': 'loss', 'content': 0.018406063318252563, 'timestamp': '2025-09-15 03:19:45.249478', 'step': 841, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.279399', 'step': 841, 'epoch': 1} {'type': 'loss', 'content': 0.0345698781311512, 'timestamp': '2025-09-15 03:19:45.281422', 'step': 842, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.310699', 'step': 842, 'epoch': 1} {'type': 'loss', 'content': 0.015852032229304314, 'timestamp': '2025-09-15 03:19:45.313655', 'step': 843, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.344231', 'step': 843, 'epoch': 1} {'type': 'loss', 'content': 0.01953796111047268, 'timestamp': '2025-09-15 03:19:45.368112', 'step': 844, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:45.398869', 'step': 844, 'epoch': 1} {'type': 'loss', 'content': 0.005201183725148439, 'timestamp': '2025-09-15 03:19:45.401262', 'step': 845, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:45.431133', 'step': 845, 'epoch': 1} {'type': 'loss', 'content': 0.0030453321523964405, 'timestamp': '2025-09-15 03:19:45.433331', 'step': 846, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.463266', 'step': 846, 'epoch': 1} {'type': 'loss', 'content': 0.025405941531062126, 'timestamp': '2025-09-15 03:19:45.465257', 'step': 847, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:45.494939', 'step': 847, 'epoch': 1} {'type': 'loss', 'content': 0.02325121872127056, 'timestamp': '2025-09-15 03:19:45.518341', 'step': 848, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:45.548577', 'step': 848, 'epoch': 1} {'type': 'loss', 'content': 0.0039110551588237286, 'timestamp': '2025-09-15 03:19:45.550569', 'step': 849, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.580032', 'step': 849, 'epoch': 1} {'type': 'loss', 'content': 0.0093283262103796, 'timestamp': '2025-09-15 03:19:45.582138', 'step': 850, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.612011', 'step': 850, 'epoch': 1} {'type': 'loss', 'content': 0.007319420110434294, 'timestamp': '2025-09-15 03:19:45.613961', 'step': 851, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:45.644654', 'step': 851, 'epoch': 1} {'type': 'loss', 'content': 0.007478214800357819, 'timestamp': '2025-09-15 03:19:45.668145', 'step': 852, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.697634', 'step': 852, 'epoch': 1} {'type': 'loss', 'content': 0.007805486675351858, 'timestamp': '2025-09-15 03:19:45.699649', 'step': 853, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:45.729480', 'step': 853, 'epoch': 1} {'type': 'loss', 'content': 0.035240091383457184, 'timestamp': '2025-09-15 03:19:45.731466', 'step': 854, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:45.761510', 'step': 854, 'epoch': 1} {'type': 'loss', 'content': 0.010297401808202267, 'timestamp': '2025-09-15 03:19:45.763625', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:46.472603', 'step': 855, 'epoch': 1} {'type': 'pplx', 'content': 100037921.68841042, 'timestamp': '2025-09-15 03:19:46.474452', 'step': 855, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.502434', 'step': 855, 'epoch': 1} {'type': 'loss', 'content': 0.00798899494111538, 'timestamp': '2025-09-15 03:19:46.525899', 'step': 856, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.556083', 'step': 856, 'epoch': 1} {'type': 'loss', 'content': 0.012356921099126339, 'timestamp': '2025-09-15 03:19:46.558117', 'step': 857, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:46.588087', 'step': 857, 'epoch': 1} {'type': 'loss', 'content': 0.009803662076592445, 'timestamp': '2025-09-15 03:19:46.590191', 'step': 858, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.620069', 'step': 858, 'epoch': 1} {'type': 'loss', 'content': 0.011618572287261486, 'timestamp': '2025-09-15 03:19:46.622059', 'step': 859, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.651554', 'step': 859, 'epoch': 1} {'type': 'loss', 'content': 0.03089841827750206, 'timestamp': '2025-09-15 03:19:46.675023', 'step': 860, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.704709', 'step': 860, 'epoch': 1} {'type': 'loss', 'content': 0.014122366905212402, 'timestamp': '2025-09-15 03:19:46.706682', 'step': 861, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:46.737170', 'step': 861, 'epoch': 1} {'type': 'loss', 'content': 0.006204747129231691, 'timestamp': '2025-09-15 03:19:46.739307', 'step': 862, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.769294', 'step': 862, 'epoch': 1} {'type': 'loss', 'content': 0.011782601475715637, 'timestamp': '2025-09-15 03:19:46.771364', 'step': 863, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.801556', 'step': 863, 'epoch': 1} {'type': 'loss', 'content': 0.010657857172191143, 'timestamp': '2025-09-15 03:19:46.825091', 'step': 864, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.854926', 'step': 864, 'epoch': 1} {'type': 'loss', 'content': 0.012917198240756989, 'timestamp': '2025-09-15 03:19:46.856991', 'step': 865, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:46.886751', 'step': 865, 'epoch': 1} {'type': 'loss', 'content': 0.002522421535104513, 'timestamp': '2025-09-15 03:19:46.888801', 'step': 866, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.919956', 'step': 866, 'epoch': 1} {'type': 'loss', 'content': 0.002517345128580928, 'timestamp': '2025-09-15 03:19:46.922000', 'step': 867, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:46.952565', 'step': 867, 'epoch': 1} {'type': 'loss', 'content': 0.004488280508667231, 'timestamp': '2025-09-15 03:19:46.975920', 'step': 868, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:47.006076', 'step': 868, 'epoch': 1} {'type': 'loss', 'content': 0.04148333519697189, 'timestamp': '2025-09-15 03:19:47.008298', 'step': 869, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.038485', 'step': 869, 'epoch': 1} {'type': 'loss', 'content': 0.07123760879039764, 'timestamp': '2025-09-15 03:19:47.040295', 'step': 870, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.070705', 'step': 870, 'epoch': 1} {'type': 'loss', 'content': 0.016277441754937172, 'timestamp': '2025-09-15 03:19:47.072732', 'step': 871, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.103374', 'step': 871, 'epoch': 1} {'type': 'loss', 'content': 0.00033685853122733533, 'timestamp': '2025-09-15 03:19:47.126806', 'step': 872, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.157549', 'step': 872, 'epoch': 1} {'type': 'loss', 'content': 0.029803471639752388, 'timestamp': '2025-09-15 03:19:47.159632', 'step': 873, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.189237', 'step': 873, 'epoch': 1} {'type': 'loss', 'content': 0.020926347002387047, 'timestamp': '2025-09-15 03:19:47.191261', 'step': 874, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.220936', 'step': 874, 'epoch': 1} {'type': 'loss', 'content': 0.03717362508177757, 'timestamp': '2025-09-15 03:19:47.222952', 'step': 875, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.253178', 'step': 875, 'epoch': 1} {'type': 'loss', 'content': 0.05333937332034111, 'timestamp': '2025-09-15 03:19:47.276570', 'step': 876, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:47.307079', 'step': 876, 'epoch': 1} {'type': 'loss', 'content': 0.0026524479035288095, 'timestamp': '2025-09-15 03:19:47.308993', 'step': 877, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.338628', 'step': 877, 'epoch': 1} {'type': 'loss', 'content': 0.0010080545907840133, 'timestamp': '2025-09-15 03:19:47.340905', 'step': 878, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.371024', 'step': 878, 'epoch': 1} {'type': 'loss', 'content': 0.046601030975580215, 'timestamp': '2025-09-15 03:19:47.373104', 'step': 879, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:47.403307', 'step': 879, 'epoch': 1} {'type': 'loss', 'content': 0.058754634112119675, 'timestamp': '2025-09-15 03:19:47.426954', 'step': 880, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.457145', 'step': 880, 'epoch': 1} {'type': 'loss', 'content': 0.017497990280389786, 'timestamp': '2025-09-15 03:19:47.459099', 'step': 881, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.488796', 'step': 881, 'epoch': 1} {'type': 'loss', 'content': 0.029213156551122665, 'timestamp': '2025-09-15 03:19:47.490708', 'step': 882, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.520081', 'step': 882, 'epoch': 1} {'type': 'loss', 'content': 0.019578823819756508, 'timestamp': '2025-09-15 03:19:47.522043', 'step': 883, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:47.551862', 'step': 883, 'epoch': 1} {'type': 'loss', 'content': 0.007458627223968506, 'timestamp': '2025-09-15 03:19:47.575402', 'step': 884, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.605497', 'step': 884, 'epoch': 1} {'type': 'loss', 'content': 0.03434719145298004, 'timestamp': '2025-09-15 03:19:47.607573', 'step': 885, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.637221', 'step': 885, 'epoch': 1} {'type': 'loss', 'content': 0.018169419839978218, 'timestamp': '2025-09-15 03:19:47.639256', 'step': 886, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:47.669823', 'step': 886, 'epoch': 1} {'type': 'loss', 'content': 0.04940139502286911, 'timestamp': '2025-09-15 03:19:47.671951', 'step': 887, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.701865', 'step': 887, 'epoch': 1} {'type': 'loss', 'content': 0.05026103928685188, 'timestamp': '2025-09-15 03:19:47.725309', 'step': 888, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.755323', 'step': 888, 'epoch': 1} {'type': 'loss', 'content': 0.026359498500823975, 'timestamp': '2025-09-15 03:19:47.757275', 'step': 889, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.787431', 'step': 889, 'epoch': 1} {'type': 'loss', 'content': 0.0597798153758049, 'timestamp': '2025-09-15 03:19:47.789332', 'step': 890, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.818782', 'step': 890, 'epoch': 1} {'type': 'loss', 'content': 0.012453628703951836, 'timestamp': '2025-09-15 03:19:47.820736', 'step': 891, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:47.851572', 'step': 891, 'epoch': 1} {'type': 'loss', 'content': 0.02922959066927433, 'timestamp': '2025-09-15 03:19:47.875077', 'step': 892, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:47.905722', 'step': 892, 'epoch': 1} {'type': 'loss', 'content': 0.019007423892617226, 'timestamp': '2025-09-15 03:19:47.907743', 'step': 893, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.937952', 'step': 893, 'epoch': 1} {'type': 'loss', 'content': 0.05699565261602402, 'timestamp': '2025-09-15 03:19:47.940014', 'step': 894, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:47.970485', 'step': 894, 'epoch': 1} {'type': 'loss', 'content': 0.01930629089474678, 'timestamp': '2025-09-15 03:19:47.972661', 'step': 895, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.002718', 'step': 895, 'epoch': 1} {'type': 'loss', 'content': 0.02485084906220436, 'timestamp': '2025-09-15 03:19:48.027890', 'step': 896, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.057767', 'step': 896, 'epoch': 1} {'type': 'loss', 'content': 0.02043440006673336, 'timestamp': '2025-09-15 03:19:48.059724', 'step': 897, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:48.089632', 'step': 897, 'epoch': 1} {'type': 'loss', 'content': 0.018073951825499535, 'timestamp': '2025-09-15 03:19:48.092145', 'step': 898, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:48.123315', 'step': 898, 'epoch': 1} {'type': 'loss', 'content': 0.017345238476991653, 'timestamp': '2025-09-15 03:19:48.125443', 'step': 899, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:48.155262', 'step': 899, 'epoch': 1} {'type': 'loss', 'content': 0.01569359377026558, 'timestamp': '2025-09-15 03:19:48.178793', 'step': 900, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.208871', 'step': 900, 'epoch': 1} {'type': 'loss', 'content': 0.014010734856128693, 'timestamp': '2025-09-15 03:19:48.210957', 'step': 901, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.241119', 'step': 901, 'epoch': 1} {'type': 'loss', 'content': 0.008070044219493866, 'timestamp': '2025-09-15 03:19:48.243230', 'step': 902, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.273844', 'step': 902, 'epoch': 1} {'type': 'loss', 'content': 0.019519077613949776, 'timestamp': '2025-09-15 03:19:48.275881', 'step': 903, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.305848', 'step': 903, 'epoch': 1} {'type': 'loss', 'content': 0.025401020422577858, 'timestamp': '2025-09-15 03:19:48.329904', 'step': 904, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.360223', 'step': 904, 'epoch': 1} {'type': 'loss', 'content': 0.0037331220228224993, 'timestamp': '2025-09-15 03:19:48.362199', 'step': 905, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.391499', 'step': 905, 'epoch': 1} {'type': 'loss', 'content': 0.010357382707297802, 'timestamp': '2025-09-15 03:19:48.393491', 'step': 906, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:48.423588', 'step': 906, 'epoch': 1} {'type': 'loss', 'content': 0.035081468522548676, 'timestamp': '2025-09-15 03:19:48.425849', 'step': 907, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.456323', 'step': 907, 'epoch': 1} {'type': 'loss', 'content': 0.023906197398900986, 'timestamp': '2025-09-15 03:19:48.479865', 'step': 908, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:48.509964', 'step': 908, 'epoch': 1} {'type': 'loss', 'content': 0.033607613295316696, 'timestamp': '2025-09-15 03:19:48.512927', 'step': 909, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:48.542927', 'step': 909, 'epoch': 1} {'type': 'loss', 'content': 0.0026440354995429516, 'timestamp': '2025-09-15 03:19:48.544963', 'step': 910, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:48.574828', 'step': 910, 'epoch': 1} {'type': 'loss', 'content': 0.0184736680239439, 'timestamp': '2025-09-15 03:19:48.577093', 'step': 911, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:48.606736', 'step': 911, 'epoch': 1} {'type': 'loss', 'content': 0.0037520730402320623, 'timestamp': '2025-09-15 03:19:48.630137', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:49.340710', 'step': 912, 'epoch': 1} {'type': 'pplx', 'content': 72598961.44010709, 'timestamp': '2025-09-15 03:19:49.342492', 'step': 912, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.371687', 'step': 912, 'epoch': 1} {'type': 'loss', 'content': 0.03960108011960983, 'timestamp': '2025-09-15 03:19:49.374066', 'step': 913, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:49.404380', 'step': 913, 'epoch': 1} {'type': 'loss', 'content': 0.017269397154450417, 'timestamp': '2025-09-15 03:19:49.406342', 'step': 914, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.436269', 'step': 914, 'epoch': 1} {'type': 'loss', 'content': 0.0040565552189946175, 'timestamp': '2025-09-15 03:19:49.437955', 'step': 915, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.468065', 'step': 915, 'epoch': 1} {'type': 'loss', 'content': 0.003966494929045439, 'timestamp': '2025-09-15 03:19:49.491431', 'step': 916, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:49.521051', 'step': 916, 'epoch': 1} {'type': 'loss', 'content': 0.034861061722040176, 'timestamp': '2025-09-15 03:19:49.522784', 'step': 917, 'epoch': 1} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:49.571945', 'step': 917, 'epoch': 2} {'type': 'loss', 'content': 0.039139218628406525, 'timestamp': '2025-09-15 03:19:49.573899', 'step': 918, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.603605', 'step': 918, 'epoch': 2} {'type': 'loss', 'content': 0.017292311415076256, 'timestamp': '2025-09-15 03:19:49.605430', 'step': 919, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.634924', 'step': 919, 'epoch': 2} {'type': 'loss', 'content': 0.05987069010734558, 'timestamp': '2025-09-15 03:19:49.658535', 'step': 920, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.688090', 'step': 920, 'epoch': 2} {'type': 'loss', 'content': 0.024779552593827248, 'timestamp': '2025-09-15 03:19:49.690092', 'step': 921, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.720099', 'step': 921, 'epoch': 2} {'type': 'loss', 'content': 0.03946017473936081, 'timestamp': '2025-09-15 03:19:49.722247', 'step': 922, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.752125', 'step': 922, 'epoch': 2} {'type': 'loss', 'content': 0.01998963952064514, 'timestamp': '2025-09-15 03:19:49.753990', 'step': 923, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:49.783600', 'step': 923, 'epoch': 2} {'type': 'loss', 'content': 0.030949410051107407, 'timestamp': '2025-09-15 03:19:49.806874', 'step': 924, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.837311', 'step': 924, 'epoch': 2} {'type': 'loss', 'content': 0.017210932448506355, 'timestamp': '2025-09-15 03:19:49.839370', 'step': 925, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.869458', 'step': 925, 'epoch': 2} {'type': 'loss', 'content': 0.011270995251834393, 'timestamp': '2025-09-15 03:19:49.871550', 'step': 926, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.901089', 'step': 926, 'epoch': 2} {'type': 'loss', 'content': 0.025099992752075195, 'timestamp': '2025-09-15 03:19:49.903307', 'step': 927, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.933067', 'step': 927, 'epoch': 2} {'type': 'loss', 'content': 0.020727096125483513, 'timestamp': '2025-09-15 03:19:49.956605', 'step': 928, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:49.986087', 'step': 928, 'epoch': 2} {'type': 'loss', 'content': 0.01741086132824421, 'timestamp': '2025-09-15 03:19:49.988451', 'step': 929, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.017823', 'step': 929, 'epoch': 2} {'type': 'loss', 'content': 0.019103452563285828, 'timestamp': '2025-09-15 03:19:50.020315', 'step': 930, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.049761', 'step': 930, 'epoch': 2} {'type': 'loss', 'content': 0.019196193665266037, 'timestamp': '2025-09-15 03:19:50.051441', 'step': 931, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.081837', 'step': 931, 'epoch': 2} {'type': 'loss', 'content': 0.023661328479647636, 'timestamp': '2025-09-15 03:19:50.105008', 'step': 932, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.134777', 'step': 932, 'epoch': 2} {'type': 'loss', 'content': 0.024797087535262108, 'timestamp': '2025-09-15 03:19:50.137051', 'step': 933, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.167971', 'step': 933, 'epoch': 2} {'type': 'loss', 'content': 0.025742126628756523, 'timestamp': '2025-09-15 03:19:50.169909', 'step': 934, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.199762', 'step': 934, 'epoch': 2} {'type': 'loss', 'content': 0.017478061839938164, 'timestamp': '2025-09-15 03:19:50.202235', 'step': 935, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:50.232848', 'step': 935, 'epoch': 2} {'type': 'loss', 'content': 0.023372257128357887, 'timestamp': '2025-09-15 03:19:50.256493', 'step': 936, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.286609', 'step': 936, 'epoch': 2} {'type': 'loss', 'content': 0.01836288534104824, 'timestamp': '2025-09-15 03:19:50.288514', 'step': 937, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:50.318671', 'step': 937, 'epoch': 2} {'type': 'loss', 'content': 0.02380347065627575, 'timestamp': '2025-09-15 03:19:50.320765', 'step': 938, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.351214', 'step': 938, 'epoch': 2} {'type': 'loss', 'content': 0.025880884379148483, 'timestamp': '2025-09-15 03:19:50.353295', 'step': 939, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.383964', 'step': 939, 'epoch': 2} {'type': 'loss', 'content': 0.018349649384617805, 'timestamp': '2025-09-15 03:19:50.407628', 'step': 940, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.437113', 'step': 940, 'epoch': 2} {'type': 'loss', 'content': 0.026017997413873672, 'timestamp': '2025-09-15 03:19:50.438917', 'step': 941, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.469082', 'step': 941, 'epoch': 2} {'type': 'loss', 'content': 0.01880783401429653, 'timestamp': '2025-09-15 03:19:50.470919', 'step': 942, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:50.501096', 'step': 942, 'epoch': 2} {'type': 'loss', 'content': 0.015078485012054443, 'timestamp': '2025-09-15 03:19:50.503273', 'step': 943, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:50.532688', 'step': 943, 'epoch': 2} {'type': 'loss', 'content': 0.017336489632725716, 'timestamp': '2025-09-15 03:19:50.555910', 'step': 944, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.585760', 'step': 944, 'epoch': 2} {'type': 'loss', 'content': 0.02135271206498146, 'timestamp': '2025-09-15 03:19:50.587553', 'step': 945, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.617823', 'step': 945, 'epoch': 2} {'type': 'loss', 'content': 0.018634533509612083, 'timestamp': '2025-09-15 03:19:50.619666', 'step': 946, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.649299', 'step': 946, 'epoch': 2} {'type': 'loss', 'content': 0.01942361891269684, 'timestamp': '2025-09-15 03:19:50.651036', 'step': 947, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.680156', 'step': 947, 'epoch': 2} {'type': 'loss', 'content': 0.02213677205145359, 'timestamp': '2025-09-15 03:19:50.703548', 'step': 948, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.733476', 'step': 948, 'epoch': 2} {'type': 'loss', 'content': 0.017390018329024315, 'timestamp': '2025-09-15 03:19:50.735496', 'step': 949, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.765761', 'step': 949, 'epoch': 2} {'type': 'loss', 'content': 0.008217746391892433, 'timestamp': '2025-09-15 03:19:50.768128', 'step': 950, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.797893', 'step': 950, 'epoch': 2} {'type': 'loss', 'content': 0.032679129391908646, 'timestamp': '2025-09-15 03:19:50.799998', 'step': 951, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.829730', 'step': 951, 'epoch': 2} {'type': 'loss', 'content': 0.00899417418986559, 'timestamp': '2025-09-15 03:19:50.853197', 'step': 952, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.882525', 'step': 952, 'epoch': 2} {'type': 'loss', 'content': 0.01678101159632206, 'timestamp': '2025-09-15 03:19:50.884330', 'step': 953, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.913859', 'step': 953, 'epoch': 2} {'type': 'loss', 'content': 0.01344042457640171, 'timestamp': '2025-09-15 03:19:50.915537', 'step': 954, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:50.945118', 'step': 954, 'epoch': 2} {'type': 'loss', 'content': 0.013032278046011925, 'timestamp': '2025-09-15 03:19:50.946942', 'step': 955, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:50.976642', 'step': 955, 'epoch': 2} {'type': 'loss', 'content': 0.007103900425136089, 'timestamp': '2025-09-15 03:19:50.999813', 'step': 956, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.029536', 'step': 956, 'epoch': 2} {'type': 'loss', 'content': 0.01115063764154911, 'timestamp': '2025-09-15 03:19:51.031414', 'step': 957, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.060862', 'step': 957, 'epoch': 2} {'type': 'loss', 'content': 0.0056912945583462715, 'timestamp': '2025-09-15 03:19:51.062490', 'step': 958, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:51.092578', 'step': 958, 'epoch': 2} {'type': 'loss', 'content': 0.018493881449103355, 'timestamp': '2025-09-15 03:19:51.094650', 'step': 959, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.124662', 'step': 959, 'epoch': 2} {'type': 'loss', 'content': 0.021584318950772285, 'timestamp': '2025-09-15 03:19:51.148141', 'step': 960, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:51.178184', 'step': 960, 'epoch': 2} {'type': 'loss', 'content': 0.0019218011293560266, 'timestamp': '2025-09-15 03:19:51.180035', 'step': 961, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.210201', 'step': 961, 'epoch': 2} {'type': 'loss', 'content': 0.003553554881364107, 'timestamp': '2025-09-15 03:19:51.212278', 'step': 962, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.244332', 'step': 962, 'epoch': 2} {'type': 'loss', 'content': 0.003442298388108611, 'timestamp': '2025-09-15 03:19:51.247387', 'step': 963, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.277085', 'step': 963, 'epoch': 2} {'type': 'loss', 'content': 0.022211765870451927, 'timestamp': '2025-09-15 03:19:51.300598', 'step': 964, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.330267', 'step': 964, 'epoch': 2} {'type': 'loss', 'content': 0.009360029362142086, 'timestamp': '2025-09-15 03:19:51.332114', 'step': 965, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.361890', 'step': 965, 'epoch': 2} {'type': 'loss', 'content': 0.001736950478516519, 'timestamp': '2025-09-15 03:19:51.363682', 'step': 966, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:51.393849', 'step': 966, 'epoch': 2} {'type': 'loss', 'content': 0.0288834385573864, 'timestamp': '2025-09-15 03:19:51.395951', 'step': 967, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.425707', 'step': 967, 'epoch': 2} {'type': 'loss', 'content': 0.0033125595655292273, 'timestamp': '2025-09-15 03:19:51.448812', 'step': 968, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:51.477964', 'step': 968, 'epoch': 2} {'type': 'loss', 'content': 0.0421050563454628, 'timestamp': '2025-09-15 03:19:51.479673', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:19:52.188633', 'step': 969, 'epoch': 2} {'type': 'pplx', 'content': 63605872.170415215, 'timestamp': '2025-09-15 03:19:52.190783', 'step': 969, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.219146', 'step': 969, 'epoch': 2} {'type': 'loss', 'content': 0.03343040868639946, 'timestamp': '2025-09-15 03:19:52.221077', 'step': 970, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:52.251560', 'step': 970, 'epoch': 2} {'type': 'loss', 'content': 0.03231266885995865, 'timestamp': '2025-09-15 03:19:52.253617', 'step': 971, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.283909', 'step': 971, 'epoch': 2} {'type': 'loss', 'content': 0.01777302660048008, 'timestamp': '2025-09-15 03:19:52.307261', 'step': 972, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.337031', 'step': 972, 'epoch': 2} {'type': 'loss', 'content': 0.027679353952407837, 'timestamp': '2025-09-15 03:19:52.338850', 'step': 973, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.368361', 'step': 973, 'epoch': 2} {'type': 'loss', 'content': 0.016544125974178314, 'timestamp': '2025-09-15 03:19:52.370165', 'step': 974, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.399712', 'step': 974, 'epoch': 2} {'type': 'loss', 'content': 0.01474784966558218, 'timestamp': '2025-09-15 03:19:52.401506', 'step': 975, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.431277', 'step': 975, 'epoch': 2} {'type': 'loss', 'content': 0.021217798814177513, 'timestamp': '2025-09-15 03:19:52.454490', 'step': 976, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.483759', 'step': 976, 'epoch': 2} {'type': 'loss', 'content': 0.029132353141903877, 'timestamp': '2025-09-15 03:19:52.485573', 'step': 977, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.515487', 'step': 977, 'epoch': 2} {'type': 'loss', 'content': 0.010967997834086418, 'timestamp': '2025-09-15 03:19:52.517194', 'step': 978, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.547227', 'step': 978, 'epoch': 2} {'type': 'loss', 'content': 0.0184993464499712, 'timestamp': '2025-09-15 03:19:52.549416', 'step': 979, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.579279', 'step': 979, 'epoch': 2} {'type': 'loss', 'content': 0.0035073976032435894, 'timestamp': '2025-09-15 03:19:52.602463', 'step': 980, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.632206', 'step': 980, 'epoch': 2} {'type': 'loss', 'content': 0.004275471903383732, 'timestamp': '2025-09-15 03:19:52.634135', 'step': 981, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.663668', 'step': 981, 'epoch': 2} {'type': 'loss', 'content': 0.004823813680559397, 'timestamp': '2025-09-15 03:19:52.666112', 'step': 982, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.696134', 'step': 982, 'epoch': 2} {'type': 'loss', 'content': 0.03375326469540596, 'timestamp': '2025-09-15 03:19:52.698203', 'step': 983, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.728385', 'step': 983, 'epoch': 2} {'type': 'loss', 'content': 0.003615120192989707, 'timestamp': '2025-09-15 03:19:52.751951', 'step': 984, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.781703', 'step': 984, 'epoch': 2} {'type': 'loss', 'content': 0.023827284574508667, 'timestamp': '2025-09-15 03:19:52.783702', 'step': 985, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.815054', 'step': 985, 'epoch': 2} {'type': 'loss', 'content': 0.02614396996796131, 'timestamp': '2025-09-15 03:19:52.817057', 'step': 986, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.847061', 'step': 986, 'epoch': 2} {'type': 'loss', 'content': 0.02753402665257454, 'timestamp': '2025-09-15 03:19:52.849014', 'step': 987, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.878695', 'step': 987, 'epoch': 2} {'type': 'loss', 'content': 0.00410444475710392, 'timestamp': '2025-09-15 03:19:52.901828', 'step': 988, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.931324', 'step': 988, 'epoch': 2} {'type': 'loss', 'content': 0.03475097566843033, 'timestamp': '2025-09-15 03:19:52.933176', 'step': 989, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:52.963118', 'step': 989, 'epoch': 2} {'type': 'loss', 'content': 0.007826111279428005, 'timestamp': '2025-09-15 03:19:52.965301', 'step': 990, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:52.996524', 'step': 990, 'epoch': 2} {'type': 'loss', 'content': 0.0463503859937191, 'timestamp': '2025-09-15 03:19:52.998410', 'step': 991, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.027936', 'step': 991, 'epoch': 2} {'type': 'loss', 'content': 0.02141661010682583, 'timestamp': '2025-09-15 03:19:53.051030', 'step': 992, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:19:53.080958', 'step': 992, 'epoch': 2} {'type': 'loss', 'content': 0.021750137209892273, 'timestamp': '2025-09-15 03:19:53.083252', 'step': 993, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.113221', 'step': 993, 'epoch': 2} {'type': 'loss', 'content': 0.01930098421871662, 'timestamp': '2025-09-15 03:19:53.115183', 'step': 994, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.145170', 'step': 994, 'epoch': 2} {'type': 'loss', 'content': 0.0189370010048151, 'timestamp': '2025-09-15 03:19:53.147364', 'step': 995, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.177336', 'step': 995, 'epoch': 2} {'type': 'loss', 'content': 0.019197093322873116, 'timestamp': '2025-09-15 03:19:53.200739', 'step': 996, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:19:53.231394', 'step': 996, 'epoch': 2} {'type': 'loss', 'content': 0.020200148224830627, 'timestamp': '2025-09-15 03:19:53.233327', 'step': 997, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.262987', 'step': 997, 'epoch': 2} {'type': 'loss', 'content': 0.022475184872746468, 'timestamp': '2025-09-15 03:19:53.265309', 'step': 998, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.294892', 'step': 998, 'epoch': 2} {'type': 'loss', 'content': 0.024552693590521812, 'timestamp': '2025-09-15 03:19:53.296946', 'step': 999, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:19:53.326379', 'step': 999, 'epoch': 2} {'type': 'loss', 'content': 0.006172493565827608, 'timestamp': '2025-09-15 03:19:53.349406', 'step': 1000, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1000', 'timestamp': '2025-09-15 03:20:00.254972', 'step': 1000, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.288498', 'step': 1000, 'epoch': 2} {'type': 'loss', 'content': 0.02578757330775261, 'timestamp': '2025-09-15 03:20:00.290858', 'step': 1001, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.322099', 'step': 1001, 'epoch': 2} {'type': 'loss', 'content': 0.021511312574148178, 'timestamp': '2025-09-15 03:20:00.324165', 'step': 1002, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.354323', 'step': 1002, 'epoch': 2} {'type': 'loss', 'content': 0.02874942310154438, 'timestamp': '2025-09-15 03:20:00.356530', 'step': 1003, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.386747', 'step': 1003, 'epoch': 2} {'type': 'loss', 'content': 0.013618089258670807, 'timestamp': '2025-09-15 03:20:00.410700', 'step': 1004, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.441085', 'step': 1004, 'epoch': 2} {'type': 'loss', 'content': 0.01972813531756401, 'timestamp': '2025-09-15 03:20:00.443308', 'step': 1005, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.473291', 'step': 1005, 'epoch': 2} {'type': 'loss', 'content': 0.027795910835266113, 'timestamp': '2025-09-15 03:20:00.475389', 'step': 1006, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:00.506108', 'step': 1006, 'epoch': 2} {'type': 'loss', 'content': 0.021912166848778725, 'timestamp': '2025-09-15 03:20:00.508306', 'step': 1007, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.538933', 'step': 1007, 'epoch': 2} {'type': 'loss', 'content': 0.03454792872071266, 'timestamp': '2025-09-15 03:20:00.562504', 'step': 1008, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:00.593899', 'step': 1008, 'epoch': 2} {'type': 'loss', 'content': 0.02996695600450039, 'timestamp': '2025-09-15 03:20:00.595860', 'step': 1009, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.625973', 'step': 1009, 'epoch': 2} {'type': 'loss', 'content': 0.018901299685239792, 'timestamp': '2025-09-15 03:20:00.628067', 'step': 1010, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.657906', 'step': 1010, 'epoch': 2} {'type': 'loss', 'content': 0.009113982319831848, 'timestamp': '2025-09-15 03:20:00.659997', 'step': 1011, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.691078', 'step': 1011, 'epoch': 2} {'type': 'loss', 'content': 0.030469568446278572, 'timestamp': '2025-09-15 03:20:00.714502', 'step': 1012, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.744564', 'step': 1012, 'epoch': 2} {'type': 'loss', 'content': 0.046722956001758575, 'timestamp': '2025-09-15 03:20:00.746632', 'step': 1013, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.776566', 'step': 1013, 'epoch': 2} {'type': 'loss', 'content': 0.019341707229614258, 'timestamp': '2025-09-15 03:20:00.778404', 'step': 1014, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.807850', 'step': 1014, 'epoch': 2} {'type': 'loss', 'content': 0.01661626063287258, 'timestamp': '2025-09-15 03:20:00.810043', 'step': 1015, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.840688', 'step': 1015, 'epoch': 2} {'type': 'loss', 'content': 0.008797021582722664, 'timestamp': '2025-09-15 03:20:00.864255', 'step': 1016, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.894247', 'step': 1016, 'epoch': 2} {'type': 'loss', 'content': 0.02718176133930683, 'timestamp': '2025-09-15 03:20:00.896243', 'step': 1017, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.925650', 'step': 1017, 'epoch': 2} {'type': 'loss', 'content': 0.020584603771567345, 'timestamp': '2025-09-15 03:20:00.927833', 'step': 1018, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.957848', 'step': 1018, 'epoch': 2} {'type': 'loss', 'content': 0.011983073316514492, 'timestamp': '2025-09-15 03:20:00.960083', 'step': 1019, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:00.990180', 'step': 1019, 'epoch': 2} {'type': 'loss', 'content': 0.017778072506189346, 'timestamp': '2025-09-15 03:20:01.013787', 'step': 1020, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:01.043857', 'step': 1020, 'epoch': 2} {'type': 'loss', 'content': 0.025459052994847298, 'timestamp': '2025-09-15 03:20:01.045827', 'step': 1021, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:01.075973', 'step': 1021, 'epoch': 2} {'type': 'loss', 'content': 0.035514552146196365, 'timestamp': '2025-09-15 03:20:01.077889', 'step': 1022, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:01.108193', 'step': 1022, 'epoch': 2} {'type': 'loss', 'content': 0.0033756468910723925, 'timestamp': '2025-09-15 03:20:01.110262', 'step': 1023, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:01.140585', 'step': 1023, 'epoch': 2} {'type': 'loss', 'content': 0.008607217110693455, 'timestamp': '2025-09-15 03:20:01.164036', 'step': 1024, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:01.194052', 'step': 1024, 'epoch': 2} {'type': 'loss', 'content': 0.03825444355607033, 'timestamp': '2025-09-15 03:20:01.195998', 'step': 1025, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:01.226391', 'step': 1025, 'epoch': 2} {'type': 'loss', 'content': 0.04544388875365257, 'timestamp': '2025-09-15 03:20:01.228431', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:01.951985', 'step': 1026, 'epoch': 2} {'type': 'pplx', 'content': 59711159.30161082, 'timestamp': '2025-09-15 03:20:01.954028', 'step': 1026, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:01.982504', 'step': 1026, 'epoch': 2} {'type': 'loss', 'content': 0.05692798271775246, 'timestamp': '2025-09-15 03:20:01.984823', 'step': 1027, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.015273', 'step': 1027, 'epoch': 2} {'type': 'loss', 'content': 0.003602404845878482, 'timestamp': '2025-09-15 03:20:02.038767', 'step': 1028, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.069047', 'step': 1028, 'epoch': 2} {'type': 'loss', 'content': 0.013803867623209953, 'timestamp': '2025-09-15 03:20:02.071440', 'step': 1029, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.101812', 'step': 1029, 'epoch': 2} {'type': 'loss', 'content': 0.012036988511681557, 'timestamp': '2025-09-15 03:20:02.104059', 'step': 1030, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.134478', 'step': 1030, 'epoch': 2} {'type': 'loss', 'content': 0.011974446475505829, 'timestamp': '2025-09-15 03:20:02.136849', 'step': 1031, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.167057', 'step': 1031, 'epoch': 2} {'type': 'loss', 'content': 0.0036804601550102234, 'timestamp': '2025-09-15 03:20:02.190434', 'step': 1032, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:02.220407', 'step': 1032, 'epoch': 2} {'type': 'loss', 'content': 0.01505777146667242, 'timestamp': '2025-09-15 03:20:02.222542', 'step': 1033, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.252522', 'step': 1033, 'epoch': 2} {'type': 'loss', 'content': 0.017127230763435364, 'timestamp': '2025-09-15 03:20:02.254610', 'step': 1034, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.284586', 'step': 1034, 'epoch': 2} {'type': 'loss', 'content': 0.015601657330989838, 'timestamp': '2025-09-15 03:20:02.286965', 'step': 1035, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.317206', 'step': 1035, 'epoch': 2} {'type': 'loss', 'content': 0.006081894971430302, 'timestamp': '2025-09-15 03:20:02.340566', 'step': 1036, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.370407', 'step': 1036, 'epoch': 2} {'type': 'loss', 'content': 0.00815503392368555, 'timestamp': '2025-09-15 03:20:02.372451', 'step': 1037, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.403853', 'step': 1037, 'epoch': 2} {'type': 'loss', 'content': 0.042893558740615845, 'timestamp': '2025-09-15 03:20:02.405886', 'step': 1038, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:02.437135', 'step': 1038, 'epoch': 2} {'type': 'loss', 'content': 0.017841657623648643, 'timestamp': '2025-09-15 03:20:02.439162', 'step': 1039, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:02.469180', 'step': 1039, 'epoch': 2} {'type': 'loss', 'content': 0.024145500734448433, 'timestamp': '2025-09-15 03:20:02.492792', 'step': 1040, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.523327', 'step': 1040, 'epoch': 2} {'type': 'loss', 'content': 0.014505532570183277, 'timestamp': '2025-09-15 03:20:02.525445', 'step': 1041, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.554955', 'step': 1041, 'epoch': 2} {'type': 'loss', 'content': 0.009613566100597382, 'timestamp': '2025-09-15 03:20:02.557049', 'step': 1042, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.587058', 'step': 1042, 'epoch': 2} {'type': 'loss', 'content': 0.04523482546210289, 'timestamp': '2025-09-15 03:20:02.590149', 'step': 1043, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:02.619880', 'step': 1043, 'epoch': 2} {'type': 'loss', 'content': 0.01229308545589447, 'timestamp': '2025-09-15 03:20:02.643746', 'step': 1044, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.674542', 'step': 1044, 'epoch': 2} {'type': 'loss', 'content': 0.006579564418643713, 'timestamp': '2025-09-15 03:20:02.676729', 'step': 1045, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.723819', 'step': 1045, 'epoch': 2} {'type': 'loss', 'content': 0.011694149114191532, 'timestamp': '2025-09-15 03:20:02.726307', 'step': 1046, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.756165', 'step': 1046, 'epoch': 2} {'type': 'loss', 'content': 0.03836764767765999, 'timestamp': '2025-09-15 03:20:02.758285', 'step': 1047, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.787816', 'step': 1047, 'epoch': 2} {'type': 'loss', 'content': 0.01350962370634079, 'timestamp': '2025-09-15 03:20:02.811173', 'step': 1048, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.842403', 'step': 1048, 'epoch': 2} {'type': 'loss', 'content': 0.008706008084118366, 'timestamp': '2025-09-15 03:20:02.844692', 'step': 1049, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:02.874765', 'step': 1049, 'epoch': 2} {'type': 'loss', 'content': 0.01117317657917738, 'timestamp': '2025-09-15 03:20:02.877109', 'step': 1050, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.907109', 'step': 1050, 'epoch': 2} {'type': 'loss', 'content': 0.026028618216514587, 'timestamp': '2025-09-15 03:20:02.909387', 'step': 1051, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:02.939090', 'step': 1051, 'epoch': 2} {'type': 'loss', 'content': 0.011078967712819576, 'timestamp': '2025-09-15 03:20:02.962999', 'step': 1052, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:02.992792', 'step': 1052, 'epoch': 2} {'type': 'loss', 'content': 0.0195362139493227, 'timestamp': '2025-09-15 03:20:02.995803', 'step': 1053, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:03.025872', 'step': 1053, 'epoch': 2} {'type': 'loss', 'content': 0.027163058519363403, 'timestamp': '2025-09-15 03:20:03.027935', 'step': 1054, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.057539', 'step': 1054, 'epoch': 2} {'type': 'loss', 'content': 0.029660746455192566, 'timestamp': '2025-09-15 03:20:03.059858', 'step': 1055, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.089819', 'step': 1055, 'epoch': 2} {'type': 'loss', 'content': 0.03723704442381859, 'timestamp': '2025-09-15 03:20:03.113177', 'step': 1056, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.142899', 'step': 1056, 'epoch': 2} {'type': 'loss', 'content': 0.019413789734244347, 'timestamp': '2025-09-15 03:20:03.145006', 'step': 1057, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.175594', 'step': 1057, 'epoch': 2} {'type': 'loss', 'content': 0.023310324177145958, 'timestamp': '2025-09-15 03:20:03.177732', 'step': 1058, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.208151', 'step': 1058, 'epoch': 2} {'type': 'loss', 'content': 0.03065226413309574, 'timestamp': '2025-09-15 03:20:03.210414', 'step': 1059, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.240256', 'step': 1059, 'epoch': 2} {'type': 'loss', 'content': 0.014415273442864418, 'timestamp': '2025-09-15 03:20:03.263736', 'step': 1060, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.294766', 'step': 1060, 'epoch': 2} {'type': 'loss', 'content': 0.014950952492654324, 'timestamp': '2025-09-15 03:20:03.297113', 'step': 1061, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.329486', 'step': 1061, 'epoch': 2} {'type': 'loss', 'content': 0.01662154495716095, 'timestamp': '2025-09-15 03:20:03.331787', 'step': 1062, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.363111', 'step': 1062, 'epoch': 2} {'type': 'loss', 'content': 0.0073923333548009396, 'timestamp': '2025-09-15 03:20:03.365302', 'step': 1063, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.396160', 'step': 1063, 'epoch': 2} {'type': 'loss', 'content': 0.010172292590141296, 'timestamp': '2025-09-15 03:20:03.419487', 'step': 1064, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.450602', 'step': 1064, 'epoch': 2} {'type': 'loss', 'content': 0.03431609272956848, 'timestamp': '2025-09-15 03:20:03.452623', 'step': 1065, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.483055', 'step': 1065, 'epoch': 2} {'type': 'loss', 'content': 0.02533714286983013, 'timestamp': '2025-09-15 03:20:03.485153', 'step': 1066, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.515326', 'step': 1066, 'epoch': 2} {'type': 'loss', 'content': 0.0259034913033247, 'timestamp': '2025-09-15 03:20:03.517558', 'step': 1067, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.547202', 'step': 1067, 'epoch': 2} {'type': 'loss', 'content': 0.011607496067881584, 'timestamp': '2025-09-15 03:20:03.570776', 'step': 1068, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.600671', 'step': 1068, 'epoch': 2} {'type': 'loss', 'content': 0.006037650164216757, 'timestamp': '2025-09-15 03:20:03.603619', 'step': 1069, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:03.633847', 'step': 1069, 'epoch': 2} {'type': 'loss', 'content': 0.014436027966439724, 'timestamp': '2025-09-15 03:20:03.635891', 'step': 1070, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:03.665951', 'step': 1070, 'epoch': 2} {'type': 'loss', 'content': 0.020872337743639946, 'timestamp': '2025-09-15 03:20:03.667991', 'step': 1071, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.698968', 'step': 1071, 'epoch': 2} {'type': 'loss', 'content': 0.024816256016492844, 'timestamp': '2025-09-15 03:20:03.722331', 'step': 1072, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.753776', 'step': 1072, 'epoch': 2} {'type': 'loss', 'content': 0.010879909619688988, 'timestamp': '2025-09-15 03:20:03.755802', 'step': 1073, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.786671', 'step': 1073, 'epoch': 2} {'type': 'loss', 'content': 0.04255342856049538, 'timestamp': '2025-09-15 03:20:03.788986', 'step': 1074, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.818928', 'step': 1074, 'epoch': 2} {'type': 'loss', 'content': 0.04912864789366722, 'timestamp': '2025-09-15 03:20:03.821220', 'step': 1075, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:03.850401', 'step': 1075, 'epoch': 2} {'type': 'loss', 'content': 0.046083271503448486, 'timestamp': '2025-09-15 03:20:03.873741', 'step': 1076, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.903722', 'step': 1076, 'epoch': 2} {'type': 'loss', 'content': 0.02562423050403595, 'timestamp': '2025-09-15 03:20:03.912649', 'step': 1077, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.947024', 'step': 1077, 'epoch': 2} {'type': 'loss', 'content': 0.023624440655112267, 'timestamp': '2025-09-15 03:20:03.949139', 'step': 1078, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:03.978694', 'step': 1078, 'epoch': 2} {'type': 'loss', 'content': 0.017809877172112465, 'timestamp': '2025-09-15 03:20:03.980881', 'step': 1079, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:04.010270', 'step': 1079, 'epoch': 2} {'type': 'loss', 'content': 0.018705327063798904, 'timestamp': '2025-09-15 03:20:04.033828', 'step': 1080, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:04.063874', 'step': 1080, 'epoch': 2} {'type': 'loss', 'content': 0.017726384103298187, 'timestamp': '2025-09-15 03:20:04.066214', 'step': 1081, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:04.096536', 'step': 1081, 'epoch': 2} {'type': 'loss', 'content': 0.030536355450749397, 'timestamp': '2025-09-15 03:20:04.098704', 'step': 1082, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:04.128905', 'step': 1082, 'epoch': 2} {'type': 'loss', 'content': 0.017621448263525963, 'timestamp': '2025-09-15 03:20:04.131006', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:04.839692', 'step': 1083, 'epoch': 2} {'type': 'pplx', 'content': 55375998.91955021, 'timestamp': '2025-09-15 03:20:04.842010', 'step': 1083, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:04.870342', 'step': 1083, 'epoch': 2} {'type': 'loss', 'content': 0.015300673432648182, 'timestamp': '2025-09-15 03:20:04.893914', 'step': 1084, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:04.923917', 'step': 1084, 'epoch': 2} {'type': 'loss', 'content': 0.006749466527253389, 'timestamp': '2025-09-15 03:20:04.925994', 'step': 1085, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:04.955631', 'step': 1085, 'epoch': 2} {'type': 'loss', 'content': 0.01821901462972164, 'timestamp': '2025-09-15 03:20:04.957923', 'step': 1086, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:04.988137', 'step': 1086, 'epoch': 2} {'type': 'loss', 'content': 0.028511449694633484, 'timestamp': '2025-09-15 03:20:04.990295', 'step': 1087, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.019956', 'step': 1087, 'epoch': 2} {'type': 'loss', 'content': 0.00882384367287159, 'timestamp': '2025-09-15 03:20:05.043536', 'step': 1088, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:05.073986', 'step': 1088, 'epoch': 2} {'type': 'loss', 'content': 0.028568703681230545, 'timestamp': '2025-09-15 03:20:05.076075', 'step': 1089, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.105716', 'step': 1089, 'epoch': 2} {'type': 'loss', 'content': 0.01979498565196991, 'timestamp': '2025-09-15 03:20:05.107895', 'step': 1090, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.138185', 'step': 1090, 'epoch': 2} {'type': 'loss', 'content': 0.0432886965572834, 'timestamp': '2025-09-15 03:20:05.140657', 'step': 1091, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.170573', 'step': 1091, 'epoch': 2} {'type': 'loss', 'content': 0.007654269225895405, 'timestamp': '2025-09-15 03:20:05.194149', 'step': 1092, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:05.224357', 'step': 1092, 'epoch': 2} {'type': 'loss', 'content': 0.01625342108309269, 'timestamp': '2025-09-15 03:20:05.226451', 'step': 1093, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.256212', 'step': 1093, 'epoch': 2} {'type': 'loss', 'content': 0.011958093382418156, 'timestamp': '2025-09-15 03:20:05.258352', 'step': 1094, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.290265', 'step': 1094, 'epoch': 2} {'type': 'loss', 'content': 0.010880166664719582, 'timestamp': '2025-09-15 03:20:05.292433', 'step': 1095, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.321917', 'step': 1095, 'epoch': 2} {'type': 'loss', 'content': 0.006026288028806448, 'timestamp': '2025-09-15 03:20:05.346549', 'step': 1096, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.378254', 'step': 1096, 'epoch': 2} {'type': 'loss', 'content': 0.01763133518397808, 'timestamp': '2025-09-15 03:20:05.380297', 'step': 1097, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.409340', 'step': 1097, 'epoch': 2} {'type': 'loss', 'content': 0.021360328420996666, 'timestamp': '2025-09-15 03:20:05.411697', 'step': 1098, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.441489', 'step': 1098, 'epoch': 2} {'type': 'loss', 'content': 0.020162245258688927, 'timestamp': '2025-09-15 03:20:05.443551', 'step': 1099, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.472964', 'step': 1099, 'epoch': 2} {'type': 'loss', 'content': 0.026214588433504105, 'timestamp': '2025-09-15 03:20:05.496728', 'step': 1100, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.527459', 'step': 1100, 'epoch': 2} {'type': 'loss', 'content': 0.027630941942334175, 'timestamp': '2025-09-15 03:20:05.529621', 'step': 1101, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:05.559110', 'step': 1101, 'epoch': 2} {'type': 'loss', 'content': 0.01204682793468237, 'timestamp': '2025-09-15 03:20:05.561272', 'step': 1102, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.595782', 'step': 1102, 'epoch': 2} {'type': 'loss', 'content': 0.021185899153351784, 'timestamp': '2025-09-15 03:20:05.598058', 'step': 1103, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.631793', 'step': 1103, 'epoch': 2} {'type': 'loss', 'content': 0.016678977757692337, 'timestamp': '2025-09-15 03:20:05.655330', 'step': 1104, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.684801', 'step': 1104, 'epoch': 2} {'type': 'loss', 'content': 0.01232249103486538, 'timestamp': '2025-09-15 03:20:05.687096', 'step': 1105, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.716678', 'step': 1105, 'epoch': 2} {'type': 'loss', 'content': 0.010624246671795845, 'timestamp': '2025-09-15 03:20:05.719051', 'step': 1106, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.748786', 'step': 1106, 'epoch': 2} {'type': 'loss', 'content': 0.02412816509604454, 'timestamp': '2025-09-15 03:20:05.750902', 'step': 1107, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.780579', 'step': 1107, 'epoch': 2} {'type': 'loss', 'content': 0.0221096184104681, 'timestamp': '2025-09-15 03:20:05.804000', 'step': 1108, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:05.834426', 'step': 1108, 'epoch': 2} {'type': 'loss', 'content': 0.010182474739849567, 'timestamp': '2025-09-15 03:20:05.836694', 'step': 1109, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.867243', 'step': 1109, 'epoch': 2} {'type': 'loss', 'content': 0.017153101041913033, 'timestamp': '2025-09-15 03:20:05.869479', 'step': 1110, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.899424', 'step': 1110, 'epoch': 2} {'type': 'loss', 'content': 0.01956060342490673, 'timestamp': '2025-09-15 03:20:05.901668', 'step': 1111, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:05.931341', 'step': 1111, 'epoch': 2} {'type': 'loss', 'content': 0.02774369716644287, 'timestamp': '2025-09-15 03:20:05.955226', 'step': 1112, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:05.985122', 'step': 1112, 'epoch': 2} {'type': 'loss', 'content': 0.021901097148656845, 'timestamp': '2025-09-15 03:20:05.987404', 'step': 1113, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:06.017078', 'step': 1113, 'epoch': 2} {'type': 'loss', 'content': 0.012037809006869793, 'timestamp': '2025-09-15 03:20:06.019277', 'step': 1114, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:06.049254', 'step': 1114, 'epoch': 2} {'type': 'loss', 'content': 0.021336954087018967, 'timestamp': '2025-09-15 03:20:06.051520', 'step': 1115, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:06.081661', 'step': 1115, 'epoch': 2} {'type': 'loss', 'content': 0.017799798399209976, 'timestamp': '2025-09-15 03:20:06.105196', 'step': 1116, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.134710', 'step': 1116, 'epoch': 2} {'type': 'loss', 'content': 0.02415151707828045, 'timestamp': '2025-09-15 03:20:06.136910', 'step': 1117, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:06.166921', 'step': 1117, 'epoch': 2} {'type': 'loss', 'content': 0.01389431394636631, 'timestamp': '2025-09-15 03:20:06.169292', 'step': 1118, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.198838', 'step': 1118, 'epoch': 2} {'type': 'loss', 'content': 0.01987137459218502, 'timestamp': '2025-09-15 03:20:06.200839', 'step': 1119, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.230517', 'step': 1119, 'epoch': 2} {'type': 'loss', 'content': 0.028128299862146378, 'timestamp': '2025-09-15 03:20:06.254036', 'step': 1120, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:06.283330', 'step': 1120, 'epoch': 2} {'type': 'loss', 'content': 0.0228290855884552, 'timestamp': '2025-09-15 03:20:06.285598', 'step': 1121, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:06.315935', 'step': 1121, 'epoch': 2} {'type': 'loss', 'content': 0.011218971572816372, 'timestamp': '2025-09-15 03:20:06.318052', 'step': 1122, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.348089', 'step': 1122, 'epoch': 2} {'type': 'loss', 'content': 0.014887683093547821, 'timestamp': '2025-09-15 03:20:06.350978', 'step': 1123, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.380778', 'step': 1123, 'epoch': 2} {'type': 'loss', 'content': 0.016286566853523254, 'timestamp': '2025-09-15 03:20:06.404378', 'step': 1124, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.434178', 'step': 1124, 'epoch': 2} {'type': 'loss', 'content': 0.008015617728233337, 'timestamp': '2025-09-15 03:20:06.436345', 'step': 1125, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.467907', 'step': 1125, 'epoch': 2} {'type': 'loss', 'content': 0.023351913318037987, 'timestamp': '2025-09-15 03:20:06.470131', 'step': 1126, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.499936', 'step': 1126, 'epoch': 2} {'type': 'loss', 'content': 0.02599175088107586, 'timestamp': '2025-09-15 03:20:06.501983', 'step': 1127, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.532023', 'step': 1127, 'epoch': 2} {'type': 'loss', 'content': 0.0075608291663229465, 'timestamp': '2025-09-15 03:20:06.555530', 'step': 1128, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.586133', 'step': 1128, 'epoch': 2} {'type': 'loss', 'content': 0.042395059019327164, 'timestamp': '2025-09-15 03:20:06.588495', 'step': 1129, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.619710', 'step': 1129, 'epoch': 2} {'type': 'loss', 'content': 0.027750244364142418, 'timestamp': '2025-09-15 03:20:06.621873', 'step': 1130, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.652013', 'step': 1130, 'epoch': 2} {'type': 'loss', 'content': 0.014860682189464569, 'timestamp': '2025-09-15 03:20:06.654137', 'step': 1131, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.683968', 'step': 1131, 'epoch': 2} {'type': 'loss', 'content': 0.01270745974034071, 'timestamp': '2025-09-15 03:20:06.707269', 'step': 1132, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.737889', 'step': 1132, 'epoch': 2} {'type': 'loss', 'content': 0.02112249843776226, 'timestamp': '2025-09-15 03:20:06.739991', 'step': 1133, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:06.771668', 'step': 1133, 'epoch': 2} {'type': 'loss', 'content': 0.0172148235142231, 'timestamp': '2025-09-15 03:20:06.773822', 'step': 1134, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:06.803888', 'step': 1134, 'epoch': 2} {'type': 'loss', 'content': 0.024268055334687233, 'timestamp': '2025-09-15 03:20:06.806230', 'step': 1135, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.836326', 'step': 1135, 'epoch': 2} {'type': 'loss', 'content': 0.01310635544359684, 'timestamp': '2025-09-15 03:20:06.860080', 'step': 1136, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.889990', 'step': 1136, 'epoch': 2} {'type': 'loss', 'content': 0.008171673864126205, 'timestamp': '2025-09-15 03:20:06.892003', 'step': 1137, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.921888', 'step': 1137, 'epoch': 2} {'type': 'loss', 'content': 0.017726849764585495, 'timestamp': '2025-09-15 03:20:06.923979', 'step': 1138, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.953873', 'step': 1138, 'epoch': 2} {'type': 'loss', 'content': 0.011989779770374298, 'timestamp': '2025-09-15 03:20:06.955940', 'step': 1139, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:06.986103', 'step': 1139, 'epoch': 2} {'type': 'loss', 'content': 0.020538654178380966, 'timestamp': '2025-09-15 03:20:07.009578', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:07.720501', 'step': 1140, 'epoch': 2} {'type': 'pplx', 'content': 58533645.06906351, 'timestamp': '2025-09-15 03:20:07.722844', 'step': 1140, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:07.751397', 'step': 1140, 'epoch': 2} {'type': 'loss', 'content': 0.027201279997825623, 'timestamp': '2025-09-15 03:20:07.754517', 'step': 1141, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:07.785506', 'step': 1141, 'epoch': 2} {'type': 'loss', 'content': 0.010129685513675213, 'timestamp': '2025-09-15 03:20:07.787641', 'step': 1142, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:07.817709', 'step': 1142, 'epoch': 2} {'type': 'loss', 'content': 0.01725381426513195, 'timestamp': '2025-09-15 03:20:07.819784', 'step': 1143, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:07.850024', 'step': 1143, 'epoch': 2} {'type': 'loss', 'content': 0.03179285675287247, 'timestamp': '2025-09-15 03:20:07.873600', 'step': 1144, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:07.906912', 'step': 1144, 'epoch': 2} {'type': 'loss', 'content': 0.005451020319014788, 'timestamp': '2025-09-15 03:20:07.909009', 'step': 1145, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:07.939031', 'step': 1145, 'epoch': 2} {'type': 'loss', 'content': 0.019774705171585083, 'timestamp': '2025-09-15 03:20:07.941412', 'step': 1146, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:07.971686', 'step': 1146, 'epoch': 2} {'type': 'loss', 'content': 0.0117212338373065, 'timestamp': '2025-09-15 03:20:07.973952', 'step': 1147, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.003950', 'step': 1147, 'epoch': 2} {'type': 'loss', 'content': 0.012994612567126751, 'timestamp': '2025-09-15 03:20:08.027561', 'step': 1148, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.057386', 'step': 1148, 'epoch': 2} {'type': 'loss', 'content': 0.02703903801739216, 'timestamp': '2025-09-15 03:20:08.060411', 'step': 1149, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:08.090655', 'step': 1149, 'epoch': 2} {'type': 'loss', 'content': 0.014843891374766827, 'timestamp': '2025-09-15 03:20:08.099855', 'step': 1150, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.132891', 'step': 1150, 'epoch': 2} {'type': 'loss', 'content': 0.03394562751054764, 'timestamp': '2025-09-15 03:20:08.135316', 'step': 1151, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.165110', 'step': 1151, 'epoch': 2} {'type': 'loss', 'content': 0.01329898927360773, 'timestamp': '2025-09-15 03:20:08.188693', 'step': 1152, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.218582', 'step': 1152, 'epoch': 2} {'type': 'loss', 'content': 0.011320680379867554, 'timestamp': '2025-09-15 03:20:08.220791', 'step': 1153, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:08.250814', 'step': 1153, 'epoch': 2} {'type': 'loss', 'content': 0.011235682293772697, 'timestamp': '2025-09-15 03:20:08.253892', 'step': 1154, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.284099', 'step': 1154, 'epoch': 2} {'type': 'loss', 'content': 0.021955570206046104, 'timestamp': '2025-09-15 03:20:08.286123', 'step': 1155, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.322440', 'step': 1155, 'epoch': 2} {'type': 'loss', 'content': 0.02674642577767372, 'timestamp': '2025-09-15 03:20:08.346116', 'step': 1156, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.376096', 'step': 1156, 'epoch': 2} {'type': 'loss', 'content': 0.01354469545185566, 'timestamp': '2025-09-15 03:20:08.378318', 'step': 1157, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:08.425883', 'step': 1157, 'epoch': 2} {'type': 'loss', 'content': 0.00968851987272501, 'timestamp': '2025-09-15 03:20:08.428236', 'step': 1158, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.458720', 'step': 1158, 'epoch': 2} {'type': 'loss', 'content': 0.013550050556659698, 'timestamp': '2025-09-15 03:20:08.461019', 'step': 1159, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.491579', 'step': 1159, 'epoch': 2} {'type': 'loss', 'content': 0.00861669797450304, 'timestamp': '2025-09-15 03:20:08.515133', 'step': 1160, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:08.545114', 'step': 1160, 'epoch': 2} {'type': 'loss', 'content': 0.007723599206656218, 'timestamp': '2025-09-15 03:20:08.547419', 'step': 1161, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.577516', 'step': 1161, 'epoch': 2} {'type': 'loss', 'content': 0.026734083890914917, 'timestamp': '2025-09-15 03:20:08.582763', 'step': 1162, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.613178', 'step': 1162, 'epoch': 2} {'type': 'loss', 'content': 0.01663447730243206, 'timestamp': '2025-09-15 03:20:08.615406', 'step': 1163, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:08.645653', 'step': 1163, 'epoch': 2} {'type': 'loss', 'content': 0.02458677813410759, 'timestamp': '2025-09-15 03:20:08.669304', 'step': 1164, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.698900', 'step': 1164, 'epoch': 2} {'type': 'loss', 'content': 0.01422831416130066, 'timestamp': '2025-09-15 03:20:08.702184', 'step': 1165, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.732012', 'step': 1165, 'epoch': 2} {'type': 'loss', 'content': 0.018267182633280754, 'timestamp': '2025-09-15 03:20:08.734407', 'step': 1166, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:08.764390', 'step': 1166, 'epoch': 2} {'type': 'loss', 'content': 0.010763383470475674, 'timestamp': '2025-09-15 03:20:08.768328', 'step': 1167, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.800328', 'step': 1167, 'epoch': 2} {'type': 'loss', 'content': 0.04157629609107971, 'timestamp': '2025-09-15 03:20:08.823943', 'step': 1168, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:08.853756', 'step': 1168, 'epoch': 2} {'type': 'loss', 'content': 0.015311472117900848, 'timestamp': '2025-09-15 03:20:08.856072', 'step': 1169, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:08.886614', 'step': 1169, 'epoch': 2} {'type': 'loss', 'content': 0.008150981739163399, 'timestamp': '2025-09-15 03:20:08.888892', 'step': 1170, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:08.920412', 'step': 1170, 'epoch': 2} {'type': 'loss', 'content': 0.014152586460113525, 'timestamp': '2025-09-15 03:20:08.922847', 'step': 1171, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:08.953027', 'step': 1171, 'epoch': 2} {'type': 'loss', 'content': 0.007704091724008322, 'timestamp': '2025-09-15 03:20:08.976592', 'step': 1172, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:09.013147', 'step': 1172, 'epoch': 2} {'type': 'loss', 'content': 0.015262416563928127, 'timestamp': '2025-09-15 03:20:09.015163', 'step': 1173, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.045465', 'step': 1173, 'epoch': 2} {'type': 'loss', 'content': 0.012681369669735432, 'timestamp': '2025-09-15 03:20:09.047517', 'step': 1174, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.077389', 'step': 1174, 'epoch': 2} {'type': 'loss', 'content': 0.020055728033185005, 'timestamp': '2025-09-15 03:20:09.079517', 'step': 1175, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.109478', 'step': 1175, 'epoch': 2} {'type': 'loss', 'content': 0.010338046588003635, 'timestamp': '2025-09-15 03:20:09.132776', 'step': 1176, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:09.164810', 'step': 1176, 'epoch': 2} {'type': 'loss', 'content': 0.026065126061439514, 'timestamp': '2025-09-15 03:20:09.172457', 'step': 1177, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.212075', 'step': 1177, 'epoch': 2} {'type': 'loss', 'content': 0.014396383427083492, 'timestamp': '2025-09-15 03:20:09.222229', 'step': 1178, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.255510', 'step': 1178, 'epoch': 2} {'type': 'loss', 'content': 0.027096567675471306, 'timestamp': '2025-09-15 03:20:09.259540', 'step': 1179, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:09.290560', 'step': 1179, 'epoch': 2} {'type': 'loss', 'content': 0.015955336391925812, 'timestamp': '2025-09-15 03:20:09.314189', 'step': 1180, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:09.344417', 'step': 1180, 'epoch': 2} {'type': 'loss', 'content': 0.030931200832128525, 'timestamp': '2025-09-15 03:20:09.346574', 'step': 1181, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.376598', 'step': 1181, 'epoch': 2} {'type': 'loss', 'content': 0.031671468168497086, 'timestamp': '2025-09-15 03:20:09.378877', 'step': 1182, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.409383', 'step': 1182, 'epoch': 2} {'type': 'loss', 'content': 0.015935173258185387, 'timestamp': '2025-09-15 03:20:09.411488', 'step': 1183, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:09.441534', 'step': 1183, 'epoch': 2} {'type': 'loss', 'content': 0.007875466719269753, 'timestamp': '2025-09-15 03:20:09.465515', 'step': 1184, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:09.495876', 'step': 1184, 'epoch': 2} {'type': 'loss', 'content': 0.021102117374539375, 'timestamp': '2025-09-15 03:20:09.498337', 'step': 1185, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:09.529458', 'step': 1185, 'epoch': 2} {'type': 'loss', 'content': 0.015338304452598095, 'timestamp': '2025-09-15 03:20:09.532557', 'step': 1186, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.566516', 'step': 1186, 'epoch': 2} {'type': 'loss', 'content': 0.0060651483945548534, 'timestamp': '2025-09-15 03:20:09.568660', 'step': 1187, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.598459', 'step': 1187, 'epoch': 2} {'type': 'loss', 'content': 0.009633776731789112, 'timestamp': '2025-09-15 03:20:09.623191', 'step': 1188, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:09.653423', 'step': 1188, 'epoch': 2} {'type': 'loss', 'content': 0.014106557704508305, 'timestamp': '2025-09-15 03:20:09.657651', 'step': 1189, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.688411', 'step': 1189, 'epoch': 2} {'type': 'loss', 'content': 0.027194740250706673, 'timestamp': '2025-09-15 03:20:09.691131', 'step': 1190, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:09.723940', 'step': 1190, 'epoch': 2} {'type': 'loss', 'content': 0.012174146249890327, 'timestamp': '2025-09-15 03:20:09.726496', 'step': 1191, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.757722', 'step': 1191, 'epoch': 2} {'type': 'loss', 'content': 0.01647491380572319, 'timestamp': '2025-09-15 03:20:09.781245', 'step': 1192, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:09.812233', 'step': 1192, 'epoch': 2} {'type': 'loss', 'content': 0.015903526917099953, 'timestamp': '2025-09-15 03:20:09.814379', 'step': 1193, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.845418', 'step': 1193, 'epoch': 2} {'type': 'loss', 'content': 0.0020331903360784054, 'timestamp': '2025-09-15 03:20:09.847565', 'step': 1194, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:09.877826', 'step': 1194, 'epoch': 2} {'type': 'loss', 'content': 0.044761087745428085, 'timestamp': '2025-09-15 03:20:09.879981', 'step': 1195, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:09.912771', 'step': 1195, 'epoch': 2} {'type': 'loss', 'content': 0.012461199425160885, 'timestamp': '2025-09-15 03:20:09.936396', 'step': 1196, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:09.966526', 'step': 1196, 'epoch': 2} {'type': 'loss', 'content': 0.05127660185098648, 'timestamp': '2025-09-15 03:20:09.968980', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:10.683931', 'step': 1197, 'epoch': 2} {'type': 'pplx', 'content': 67346794.62348509, 'timestamp': '2025-09-15 03:20:10.686006', 'step': 1197, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:10.715067', 'step': 1197, 'epoch': 2} {'type': 'loss', 'content': 0.006219768431037664, 'timestamp': '2025-09-15 03:20:10.717533', 'step': 1198, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:10.747307', 'step': 1198, 'epoch': 2} {'type': 'loss', 'content': 0.012562769465148449, 'timestamp': '2025-09-15 03:20:10.749519', 'step': 1199, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:10.779485', 'step': 1199, 'epoch': 2} {'type': 'loss', 'content': 0.019395234063267708, 'timestamp': '2025-09-15 03:20:10.803411', 'step': 1200, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:10.834066', 'step': 1200, 'epoch': 2} {'type': 'loss', 'content': 0.046573035418987274, 'timestamp': '2025-09-15 03:20:10.836131', 'step': 1201, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:10.866619', 'step': 1201, 'epoch': 2} {'type': 'loss', 'content': 0.0038583676796406507, 'timestamp': '2025-09-15 03:20:10.868969', 'step': 1202, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:10.899690', 'step': 1202, 'epoch': 2} {'type': 'loss', 'content': 0.002558397827669978, 'timestamp': '2025-09-15 03:20:10.902205', 'step': 1203, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:10.933138', 'step': 1203, 'epoch': 2} {'type': 'loss', 'content': 0.005045489873737097, 'timestamp': '2025-09-15 03:20:10.956734', 'step': 1204, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:10.989591', 'step': 1204, 'epoch': 2} {'type': 'loss', 'content': 0.04085572436451912, 'timestamp': '2025-09-15 03:20:10.991675', 'step': 1205, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.021099', 'step': 1205, 'epoch': 2} {'type': 'loss', 'content': 0.03398078307509422, 'timestamp': '2025-09-15 03:20:11.023213', 'step': 1206, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.052879', 'step': 1206, 'epoch': 2} {'type': 'loss', 'content': 0.0036212815903127193, 'timestamp': '2025-09-15 03:20:11.055072', 'step': 1207, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.085326', 'step': 1207, 'epoch': 2} {'type': 'loss', 'content': 0.015824228525161743, 'timestamp': '2025-09-15 03:20:11.109081', 'step': 1208, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:11.139648', 'step': 1208, 'epoch': 2} {'type': 'loss', 'content': 0.014186462387442589, 'timestamp': '2025-09-15 03:20:11.141703', 'step': 1209, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.171602', 'step': 1209, 'epoch': 2} {'type': 'loss', 'content': 0.016707351431250572, 'timestamp': '2025-09-15 03:20:11.173901', 'step': 1210, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.203784', 'step': 1210, 'epoch': 2} {'type': 'loss', 'content': 0.04144059866666794, 'timestamp': '2025-09-15 03:20:11.206167', 'step': 1211, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.235944', 'step': 1211, 'epoch': 2} {'type': 'loss', 'content': 0.013599964790046215, 'timestamp': '2025-09-15 03:20:11.259705', 'step': 1212, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:11.290162', 'step': 1212, 'epoch': 2} {'type': 'loss', 'content': 0.007864146493375301, 'timestamp': '2025-09-15 03:20:11.292299', 'step': 1213, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.321995', 'step': 1213, 'epoch': 2} {'type': 'loss', 'content': 0.009913211688399315, 'timestamp': '2025-09-15 03:20:11.324085', 'step': 1214, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.354466', 'step': 1214, 'epoch': 2} {'type': 'loss', 'content': 0.017688969150185585, 'timestamp': '2025-09-15 03:20:11.356650', 'step': 1215, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:11.386482', 'step': 1215, 'epoch': 2} {'type': 'loss', 'content': 0.023656124249100685, 'timestamp': '2025-09-15 03:20:11.410121', 'step': 1216, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:11.440142', 'step': 1216, 'epoch': 2} {'type': 'loss', 'content': 0.011671700514853, 'timestamp': '2025-09-15 03:20:11.442526', 'step': 1217, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.472326', 'step': 1217, 'epoch': 2} {'type': 'loss', 'content': 0.006454241927713156, 'timestamp': '2025-09-15 03:20:11.474413', 'step': 1218, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:11.504569', 'step': 1218, 'epoch': 2} {'type': 'loss', 'content': 0.007134907878935337, 'timestamp': '2025-09-15 03:20:11.506877', 'step': 1219, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.536463', 'step': 1219, 'epoch': 2} {'type': 'loss', 'content': 0.00786716677248478, 'timestamp': '2025-09-15 03:20:11.560023', 'step': 1220, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.589852', 'step': 1220, 'epoch': 2} {'type': 'loss', 'content': 0.011776251718401909, 'timestamp': '2025-09-15 03:20:11.592116', 'step': 1221, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.621964', 'step': 1221, 'epoch': 2} {'type': 'loss', 'content': 0.007592611480504274, 'timestamp': '2025-09-15 03:20:11.624201', 'step': 1222, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.654888', 'step': 1222, 'epoch': 2} {'type': 'loss', 'content': 0.008074778132140636, 'timestamp': '2025-09-15 03:20:11.657461', 'step': 1223, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.687227', 'step': 1223, 'epoch': 2} {'type': 'loss', 'content': 0.009150748141109943, 'timestamp': '2025-09-15 03:20:11.710887', 'step': 1224, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.741130', 'step': 1224, 'epoch': 2} {'type': 'loss', 'content': 0.014765610918402672, 'timestamp': '2025-09-15 03:20:11.743485', 'step': 1225, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.773640', 'step': 1225, 'epoch': 2} {'type': 'loss', 'content': 0.003772977739572525, 'timestamp': '2025-09-15 03:20:11.776200', 'step': 1226, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.806624', 'step': 1226, 'epoch': 2} {'type': 'loss', 'content': 0.009069901891052723, 'timestamp': '2025-09-15 03:20:11.809403', 'step': 1227, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.839226', 'step': 1227, 'epoch': 2} {'type': 'loss', 'content': 0.015366188250482082, 'timestamp': '2025-09-15 03:20:11.862951', 'step': 1228, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.892889', 'step': 1228, 'epoch': 2} {'type': 'loss', 'content': 0.00551746878772974, 'timestamp': '2025-09-15 03:20:11.895003', 'step': 1229, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.924579', 'step': 1229, 'epoch': 2} {'type': 'loss', 'content': 0.032362472265958786, 'timestamp': '2025-09-15 03:20:11.926748', 'step': 1230, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.958080', 'step': 1230, 'epoch': 2} {'type': 'loss', 'content': 0.008966001681983471, 'timestamp': '2025-09-15 03:20:11.960259', 'step': 1231, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:11.989742', 'step': 1231, 'epoch': 2} {'type': 'loss', 'content': 0.020234253257513046, 'timestamp': '2025-09-15 03:20:12.013381', 'step': 1232, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.043086', 'step': 1232, 'epoch': 2} {'type': 'loss', 'content': 0.0029388617258518934, 'timestamp': '2025-09-15 03:20:12.045211', 'step': 1233, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.074601', 'step': 1233, 'epoch': 2} {'type': 'loss', 'content': 0.008875792846083641, 'timestamp': '2025-09-15 03:20:12.076710', 'step': 1234, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:12.107652', 'step': 1234, 'epoch': 2} {'type': 'loss', 'content': 0.018813664093613625, 'timestamp': '2025-09-15 03:20:12.111082', 'step': 1235, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.141094', 'step': 1235, 'epoch': 2} {'type': 'loss', 'content': 0.010143798775970936, 'timestamp': '2025-09-15 03:20:12.164663', 'step': 1236, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:12.194883', 'step': 1236, 'epoch': 2} {'type': 'loss', 'content': 0.0108027970418334, 'timestamp': '2025-09-15 03:20:12.197020', 'step': 1237, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:12.227585', 'step': 1237, 'epoch': 2} {'type': 'loss', 'content': 0.013201083056628704, 'timestamp': '2025-09-15 03:20:12.229748', 'step': 1238, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.260484', 'step': 1238, 'epoch': 2} {'type': 'loss', 'content': 0.02639073133468628, 'timestamp': '2025-09-15 03:20:12.262578', 'step': 1239, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.293155', 'step': 1239, 'epoch': 2} {'type': 'loss', 'content': 0.01808268018066883, 'timestamp': '2025-09-15 03:20:12.316858', 'step': 1240, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:12.347508', 'step': 1240, 'epoch': 2} {'type': 'loss', 'content': 0.006471390835940838, 'timestamp': '2025-09-15 03:20:12.349692', 'step': 1241, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:12.380560', 'step': 1241, 'epoch': 2} {'type': 'loss', 'content': 0.018432533368468285, 'timestamp': '2025-09-15 03:20:12.382822', 'step': 1242, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.414986', 'step': 1242, 'epoch': 2} {'type': 'loss', 'content': 0.022524258121848106, 'timestamp': '2025-09-15 03:20:12.417103', 'step': 1243, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.447432', 'step': 1243, 'epoch': 2} {'type': 'loss', 'content': 0.024223115295171738, 'timestamp': '2025-09-15 03:20:12.471119', 'step': 1244, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:12.501793', 'step': 1244, 'epoch': 2} {'type': 'loss', 'content': 0.026791978627443314, 'timestamp': '2025-09-15 03:20:12.503876', 'step': 1245, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:12.534728', 'step': 1245, 'epoch': 2} {'type': 'loss', 'content': 0.02190900780260563, 'timestamp': '2025-09-15 03:20:12.536895', 'step': 1246, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.567362', 'step': 1246, 'epoch': 2} {'type': 'loss', 'content': 0.024278011173009872, 'timestamp': '2025-09-15 03:20:12.569589', 'step': 1247, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:12.599841', 'step': 1247, 'epoch': 2} {'type': 'loss', 'content': 0.010498437099158764, 'timestamp': '2025-09-15 03:20:12.623352', 'step': 1248, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.653854', 'step': 1248, 'epoch': 2} {'type': 'loss', 'content': 0.027558788657188416, 'timestamp': '2025-09-15 03:20:12.656104', 'step': 1249, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:12.686587', 'step': 1249, 'epoch': 2} {'type': 'loss', 'content': 0.029875023290514946, 'timestamp': '2025-09-15 03:20:12.688814', 'step': 1250, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.719071', 'step': 1250, 'epoch': 2} {'type': 'loss', 'content': 0.020839480683207512, 'timestamp': '2025-09-15 03:20:12.721354', 'step': 1251, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:12.751391', 'step': 1251, 'epoch': 2} {'type': 'loss', 'content': 0.009385906159877777, 'timestamp': '2025-09-15 03:20:12.775538', 'step': 1252, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:12.806477', 'step': 1252, 'epoch': 2} {'type': 'loss', 'content': 0.002834519138559699, 'timestamp': '2025-09-15 03:20:12.808704', 'step': 1253, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:12.839544', 'step': 1253, 'epoch': 2} {'type': 'loss', 'content': 0.010481811128556728, 'timestamp': '2025-09-15 03:20:12.841808', 'step': 1254, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:13.560081', 'step': 1254, 'epoch': 2} {'type': 'pplx', 'content': 69112101.07041304, 'timestamp': '2025-09-15 03:20:13.562009', 'step': 1254, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:13.591862', 'step': 1254, 'epoch': 2} {'type': 'loss', 'content': 0.007346840109676123, 'timestamp': '2025-09-15 03:20:13.594007', 'step': 1255, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:13.624216', 'step': 1255, 'epoch': 2} {'type': 'loss', 'content': 0.02451970987021923, 'timestamp': '2025-09-15 03:20:13.647861', 'step': 1256, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:13.678154', 'step': 1256, 'epoch': 2} {'type': 'loss', 'content': 0.016992518678307533, 'timestamp': '2025-09-15 03:20:13.680209', 'step': 1257, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:13.710544', 'step': 1257, 'epoch': 2} {'type': 'loss', 'content': 0.023409055545926094, 'timestamp': '2025-09-15 03:20:13.712656', 'step': 1258, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:13.742663', 'step': 1258, 'epoch': 2} {'type': 'loss', 'content': 0.008774764835834503, 'timestamp': '2025-09-15 03:20:13.745003', 'step': 1259, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:13.775108', 'step': 1259, 'epoch': 2} {'type': 'loss', 'content': 0.0132039999589324, 'timestamp': '2025-09-15 03:20:13.798607', 'step': 1260, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:13.829021', 'step': 1260, 'epoch': 2} {'type': 'loss', 'content': 0.05545142665505409, 'timestamp': '2025-09-15 03:20:13.831239', 'step': 1261, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:13.861887', 'step': 1261, 'epoch': 2} {'type': 'loss', 'content': 0.013375957496464252, 'timestamp': '2025-09-15 03:20:13.863961', 'step': 1262, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:13.894159', 'step': 1262, 'epoch': 2} {'type': 'loss', 'content': 0.020845327526330948, 'timestamp': '2025-09-15 03:20:13.896429', 'step': 1263, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:13.927009', 'step': 1263, 'epoch': 2} {'type': 'loss', 'content': 0.01673973724246025, 'timestamp': '2025-09-15 03:20:13.950498', 'step': 1264, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:13.980407', 'step': 1264, 'epoch': 2} {'type': 'loss', 'content': 0.008590168319642544, 'timestamp': '2025-09-15 03:20:13.982498', 'step': 1265, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.013816', 'step': 1265, 'epoch': 2} {'type': 'loss', 'content': 0.013757781125605106, 'timestamp': '2025-09-15 03:20:14.016299', 'step': 1266, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.046386', 'step': 1266, 'epoch': 2} {'type': 'loss', 'content': 0.020589904859662056, 'timestamp': '2025-09-15 03:20:14.048828', 'step': 1267, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.078507', 'step': 1267, 'epoch': 2} {'type': 'loss', 'content': 0.02532745711505413, 'timestamp': '2025-09-15 03:20:14.102254', 'step': 1268, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.131792', 'step': 1268, 'epoch': 2} {'type': 'loss', 'content': 0.015998464077711105, 'timestamp': '2025-09-15 03:20:14.133763', 'step': 1269, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.164978', 'step': 1269, 'epoch': 2} {'type': 'loss', 'content': 0.02568703703582287, 'timestamp': '2025-09-15 03:20:14.167046', 'step': 1270, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.196717', 'step': 1270, 'epoch': 2} {'type': 'loss', 'content': 0.006882861256599426, 'timestamp': '2025-09-15 03:20:14.198870', 'step': 1271, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.228951', 'step': 1271, 'epoch': 2} {'type': 'loss', 'content': 0.019829755648970604, 'timestamp': '2025-09-15 03:20:14.252841', 'step': 1272, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.282880', 'step': 1272, 'epoch': 2} {'type': 'loss', 'content': 0.009075929410755634, 'timestamp': '2025-09-15 03:20:14.284964', 'step': 1273, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.315013', 'step': 1273, 'epoch': 2} {'type': 'loss', 'content': 0.021153524518013, 'timestamp': '2025-09-15 03:20:14.317173', 'step': 1274, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.347532', 'step': 1274, 'epoch': 2} {'type': 'loss', 'content': 0.00710141658782959, 'timestamp': '2025-09-15 03:20:14.349900', 'step': 1275, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.379628', 'step': 1275, 'epoch': 2} {'type': 'loss', 'content': 0.004452737048268318, 'timestamp': '2025-09-15 03:20:14.403116', 'step': 1276, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:14.432916', 'step': 1276, 'epoch': 2} {'type': 'loss', 'content': 0.017650265246629715, 'timestamp': '2025-09-15 03:20:14.435273', 'step': 1277, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:14.465905', 'step': 1277, 'epoch': 2} {'type': 'loss', 'content': 0.011342850513756275, 'timestamp': '2025-09-15 03:20:14.468067', 'step': 1278, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.498281', 'step': 1278, 'epoch': 2} {'type': 'loss', 'content': 0.0378478541970253, 'timestamp': '2025-09-15 03:20:14.500405', 'step': 1279, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.530022', 'step': 1279, 'epoch': 2} {'type': 'loss', 'content': 0.02093484438955784, 'timestamp': '2025-09-15 03:20:14.553779', 'step': 1280, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.584569', 'step': 1280, 'epoch': 2} {'type': 'loss', 'content': 0.008996201679110527, 'timestamp': '2025-09-15 03:20:14.586753', 'step': 1281, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.618026', 'step': 1281, 'epoch': 2} {'type': 'loss', 'content': 0.016198158264160156, 'timestamp': '2025-09-15 03:20:14.620304', 'step': 1282, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.650828', 'step': 1282, 'epoch': 2} {'type': 'loss', 'content': 0.006912777666002512, 'timestamp': '2025-09-15 03:20:14.652885', 'step': 1283, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:14.684063', 'step': 1283, 'epoch': 2} {'type': 'loss', 'content': 0.006836120970547199, 'timestamp': '2025-09-15 03:20:14.707705', 'step': 1284, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.738329', 'step': 1284, 'epoch': 2} {'type': 'loss', 'content': 0.030089562758803368, 'timestamp': '2025-09-15 03:20:14.740411', 'step': 1285, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.770146', 'step': 1285, 'epoch': 2} {'type': 'loss', 'content': 0.005958269815891981, 'timestamp': '2025-09-15 03:20:14.772294', 'step': 1286, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.802219', 'step': 1286, 'epoch': 2} {'type': 'loss', 'content': 0.00943565834313631, 'timestamp': '2025-09-15 03:20:14.804382', 'step': 1287, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.833953', 'step': 1287, 'epoch': 2} {'type': 'loss', 'content': 0.004659402649849653, 'timestamp': '2025-09-15 03:20:14.857443', 'step': 1288, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.887074', 'step': 1288, 'epoch': 2} {'type': 'loss', 'content': 0.029415473341941833, 'timestamp': '2025-09-15 03:20:14.889341', 'step': 1289, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:14.919235', 'step': 1289, 'epoch': 2} {'type': 'loss', 'content': 0.010652023367583752, 'timestamp': '2025-09-15 03:20:14.921368', 'step': 1290, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.951211', 'step': 1290, 'epoch': 2} {'type': 'loss', 'content': 0.017675837501883507, 'timestamp': '2025-09-15 03:20:14.953324', 'step': 1291, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:14.982654', 'step': 1291, 'epoch': 2} {'type': 'loss', 'content': 0.018566444516181946, 'timestamp': '2025-09-15 03:20:15.006423', 'step': 1292, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.036542', 'step': 1292, 'epoch': 2} {'type': 'loss', 'content': 0.004875097889453173, 'timestamp': '2025-09-15 03:20:15.038698', 'step': 1293, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.068324', 'step': 1293, 'epoch': 2} {'type': 'loss', 'content': 0.004339300561696291, 'timestamp': '2025-09-15 03:20:15.070425', 'step': 1294, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.101645', 'step': 1294, 'epoch': 2} {'type': 'loss', 'content': 0.0056786248460412025, 'timestamp': '2025-09-15 03:20:15.103794', 'step': 1295, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.134030', 'step': 1295, 'epoch': 2} {'type': 'loss', 'content': 0.023560095578432083, 'timestamp': '2025-09-15 03:20:15.157434', 'step': 1296, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:15.187504', 'step': 1296, 'epoch': 2} {'type': 'loss', 'content': 0.014967325143516064, 'timestamp': '2025-09-15 03:20:15.189652', 'step': 1297, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.220167', 'step': 1297, 'epoch': 2} {'type': 'loss', 'content': 0.022020941600203514, 'timestamp': '2025-09-15 03:20:15.222368', 'step': 1298, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:15.252172', 'step': 1298, 'epoch': 2} {'type': 'loss', 'content': 0.019915880635380745, 'timestamp': '2025-09-15 03:20:15.254183', 'step': 1299, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.284120', 'step': 1299, 'epoch': 2} {'type': 'loss', 'content': 0.013933622278273106, 'timestamp': '2025-09-15 03:20:15.307468', 'step': 1300, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:15.338153', 'step': 1300, 'epoch': 2} {'type': 'loss', 'content': 0.015517765656113625, 'timestamp': '2025-09-15 03:20:15.340302', 'step': 1301, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:15.370246', 'step': 1301, 'epoch': 2} {'type': 'loss', 'content': 0.009845642372965813, 'timestamp': '2025-09-15 03:20:15.372527', 'step': 1302, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.403191', 'step': 1302, 'epoch': 2} {'type': 'loss', 'content': 0.011282161809504032, 'timestamp': '2025-09-15 03:20:15.405198', 'step': 1303, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.435349', 'step': 1303, 'epoch': 2} {'type': 'loss', 'content': 0.004566065035760403, 'timestamp': '2025-09-15 03:20:15.458956', 'step': 1304, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.489372', 'step': 1304, 'epoch': 2} {'type': 'loss', 'content': 0.012090700678527355, 'timestamp': '2025-09-15 03:20:15.491505', 'step': 1305, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:15.523252', 'step': 1305, 'epoch': 2} {'type': 'loss', 'content': 0.0067639728076756, 'timestamp': '2025-09-15 03:20:15.525430', 'step': 1306, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.555625', 'step': 1306, 'epoch': 2} {'type': 'loss', 'content': 0.0149248531088233, 'timestamp': '2025-09-15 03:20:15.557641', 'step': 1307, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.587512', 'step': 1307, 'epoch': 2} {'type': 'loss', 'content': 0.015327530913054943, 'timestamp': '2025-09-15 03:20:15.611240', 'step': 1308, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.640899', 'step': 1308, 'epoch': 2} {'type': 'loss', 'content': 0.007515019737184048, 'timestamp': '2025-09-15 03:20:15.643245', 'step': 1309, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:15.673234', 'step': 1309, 'epoch': 2} {'type': 'loss', 'content': 0.01602526567876339, 'timestamp': '2025-09-15 03:20:15.675673', 'step': 1310, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:15.706729', 'step': 1310, 'epoch': 2} {'type': 'loss', 'content': 0.021097447723150253, 'timestamp': '2025-09-15 03:20:15.709472', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:16.417438', 'step': 1311, 'epoch': 2} {'type': 'pplx', 'content': 76183460.07738191, 'timestamp': '2025-09-15 03:20:16.419855', 'step': 1311, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.449079', 'step': 1311, 'epoch': 2} {'type': 'loss', 'content': 0.003573415335267782, 'timestamp': '2025-09-15 03:20:16.472625', 'step': 1312, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.502606', 'step': 1312, 'epoch': 2} {'type': 'loss', 'content': 0.005960460286587477, 'timestamp': '2025-09-15 03:20:16.504818', 'step': 1313, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.535033', 'step': 1313, 'epoch': 2} {'type': 'loss', 'content': 0.0011057720985263586, 'timestamp': '2025-09-15 03:20:16.537015', 'step': 1314, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:16.567323', 'step': 1314, 'epoch': 2} {'type': 'loss', 'content': 0.012723295949399471, 'timestamp': '2025-09-15 03:20:16.569501', 'step': 1315, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.599734', 'step': 1315, 'epoch': 2} {'type': 'loss', 'content': 0.0056397877633571625, 'timestamp': '2025-09-15 03:20:16.623539', 'step': 1316, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.653233', 'step': 1316, 'epoch': 2} {'type': 'loss', 'content': 0.01716693863272667, 'timestamp': '2025-09-15 03:20:16.655336', 'step': 1317, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.685136', 'step': 1317, 'epoch': 2} {'type': 'loss', 'content': 0.002221998292952776, 'timestamp': '2025-09-15 03:20:16.687158', 'step': 1318, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:16.716713', 'step': 1318, 'epoch': 2} {'type': 'loss', 'content': 0.02662539668381214, 'timestamp': '2025-09-15 03:20:16.719082', 'step': 1319, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:16.748506', 'step': 1319, 'epoch': 2} {'type': 'loss', 'content': 0.006457353942096233, 'timestamp': '2025-09-15 03:20:16.772164', 'step': 1320, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.801557', 'step': 1320, 'epoch': 2} {'type': 'loss', 'content': 0.04024200886487961, 'timestamp': '2025-09-15 03:20:16.803679', 'step': 1321, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:16.833915', 'step': 1321, 'epoch': 2} {'type': 'loss', 'content': 0.02188965305685997, 'timestamp': '2025-09-15 03:20:16.836081', 'step': 1322, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:16.866032', 'step': 1322, 'epoch': 2} {'type': 'loss', 'content': 0.010733796283602715, 'timestamp': '2025-09-15 03:20:16.868518', 'step': 1323, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.899945', 'step': 1323, 'epoch': 2} {'type': 'loss', 'content': 0.00566983362659812, 'timestamp': '2025-09-15 03:20:16.924177', 'step': 1324, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:16.954756', 'step': 1324, 'epoch': 2} {'type': 'loss', 'content': 0.00432900246232748, 'timestamp': '2025-09-15 03:20:16.957317', 'step': 1325, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:16.987368', 'step': 1325, 'epoch': 2} {'type': 'loss', 'content': 0.006719162221997976, 'timestamp': '2025-09-15 03:20:16.989682', 'step': 1326, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.020100', 'step': 1326, 'epoch': 2} {'type': 'loss', 'content': 0.005306191276758909, 'timestamp': '2025-09-15 03:20:17.022133', 'step': 1327, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:17.052089', 'step': 1327, 'epoch': 2} {'type': 'loss', 'content': 0.008937738835811615, 'timestamp': '2025-09-15 03:20:17.075674', 'step': 1328, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:17.106078', 'step': 1328, 'epoch': 2} {'type': 'loss', 'content': 0.0164463073015213, 'timestamp': '2025-09-15 03:20:17.108150', 'step': 1329, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.137462', 'step': 1329, 'epoch': 2} {'type': 'loss', 'content': 0.005517881829291582, 'timestamp': '2025-09-15 03:20:17.139502', 'step': 1330, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.169019', 'step': 1330, 'epoch': 2} {'type': 'loss', 'content': 0.041902415454387665, 'timestamp': '2025-09-15 03:20:17.171133', 'step': 1331, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.200749', 'step': 1331, 'epoch': 2} {'type': 'loss', 'content': 0.01374905463308096, 'timestamp': '2025-09-15 03:20:17.224404', 'step': 1332, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.254785', 'step': 1332, 'epoch': 2} {'type': 'loss', 'content': 0.00975362304598093, 'timestamp': '2025-09-15 03:20:17.256854', 'step': 1333, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.286711', 'step': 1333, 'epoch': 2} {'type': 'loss', 'content': 0.013099453411996365, 'timestamp': '2025-09-15 03:20:17.288810', 'step': 1334, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.318725', 'step': 1334, 'epoch': 2} {'type': 'loss', 'content': 0.021935611963272095, 'timestamp': '2025-09-15 03:20:17.320810', 'step': 1335, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.350834', 'step': 1335, 'epoch': 2} {'type': 'loss', 'content': 0.00626968452706933, 'timestamp': '2025-09-15 03:20:17.374473', 'step': 1336, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:17.406392', 'step': 1336, 'epoch': 2} {'type': 'loss', 'content': 0.009117616340517998, 'timestamp': '2025-09-15 03:20:17.408597', 'step': 1337, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.438098', 'step': 1337, 'epoch': 2} {'type': 'loss', 'content': 0.015381724573671818, 'timestamp': '2025-09-15 03:20:17.440302', 'step': 1338, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:17.469968', 'step': 1338, 'epoch': 2} {'type': 'loss', 'content': 0.003806751687079668, 'timestamp': '2025-09-15 03:20:17.472070', 'step': 1339, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:17.502164', 'step': 1339, 'epoch': 2} {'type': 'loss', 'content': 0.013369276188313961, 'timestamp': '2025-09-15 03:20:17.525683', 'step': 1340, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:17.556028', 'step': 1340, 'epoch': 2} {'type': 'loss', 'content': 0.020524680614471436, 'timestamp': '2025-09-15 03:20:17.558096', 'step': 1341, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:17.588263', 'step': 1341, 'epoch': 2} {'type': 'loss', 'content': 0.01145204808562994, 'timestamp': '2025-09-15 03:20:17.590499', 'step': 1342, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:17.619962', 'step': 1342, 'epoch': 2} {'type': 'loss', 'content': 0.010058576241135597, 'timestamp': '2025-09-15 03:20:17.622051', 'step': 1343, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.652033', 'step': 1343, 'epoch': 2} {'type': 'loss', 'content': 0.013884322717785835, 'timestamp': '2025-09-15 03:20:17.675582', 'step': 1344, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.705407', 'step': 1344, 'epoch': 2} {'type': 'loss', 'content': 0.0023034908808767796, 'timestamp': '2025-09-15 03:20:17.707744', 'step': 1345, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.737784', 'step': 1345, 'epoch': 2} {'type': 'loss', 'content': 0.014699487946927547, 'timestamp': '2025-09-15 03:20:17.739803', 'step': 1346, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:17.769943', 'step': 1346, 'epoch': 2} {'type': 'loss', 'content': 0.013409526087343693, 'timestamp': '2025-09-15 03:20:17.772254', 'step': 1347, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.805393', 'step': 1347, 'epoch': 2} {'type': 'loss', 'content': 0.00270974007435143, 'timestamp': '2025-09-15 03:20:17.828877', 'step': 1348, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:17.864618', 'step': 1348, 'epoch': 2} {'type': 'loss', 'content': 0.002560506807640195, 'timestamp': '2025-09-15 03:20:17.866751', 'step': 1349, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.896879', 'step': 1349, 'epoch': 2} {'type': 'loss', 'content': 0.0075851972214877605, 'timestamp': '2025-09-15 03:20:17.899159', 'step': 1350, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:17.928813', 'step': 1350, 'epoch': 2} {'type': 'loss', 'content': 0.008130542933940887, 'timestamp': '2025-09-15 03:20:17.931417', 'step': 1351, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:17.961170', 'step': 1351, 'epoch': 2} {'type': 'loss', 'content': 0.0021126719657331705, 'timestamp': '2025-09-15 03:20:17.984810', 'step': 1352, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.015225', 'step': 1352, 'epoch': 2} {'type': 'loss', 'content': 0.004403482656925917, 'timestamp': '2025-09-15 03:20:18.017240', 'step': 1353, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.047055', 'step': 1353, 'epoch': 2} {'type': 'loss', 'content': 0.018507298082113266, 'timestamp': '2025-09-15 03:20:18.049010', 'step': 1354, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.078714', 'step': 1354, 'epoch': 2} {'type': 'loss', 'content': 0.019040079787373543, 'timestamp': '2025-09-15 03:20:18.080781', 'step': 1355, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.110488', 'step': 1355, 'epoch': 2} {'type': 'loss', 'content': 0.027470026165246964, 'timestamp': '2025-09-15 03:20:18.134088', 'step': 1356, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.163629', 'step': 1356, 'epoch': 2} {'type': 'loss', 'content': 0.007900933735072613, 'timestamp': '2025-09-15 03:20:18.165588', 'step': 1357, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.195120', 'step': 1357, 'epoch': 2} {'type': 'loss', 'content': 0.007872597314417362, 'timestamp': '2025-09-15 03:20:18.197346', 'step': 1358, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.227225', 'step': 1358, 'epoch': 2} {'type': 'loss', 'content': 0.03232871741056442, 'timestamp': '2025-09-15 03:20:18.229301', 'step': 1359, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:18.258742', 'step': 1359, 'epoch': 2} {'type': 'loss', 'content': 0.0032297975849360228, 'timestamp': '2025-09-15 03:20:18.282326', 'step': 1360, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:18.312465', 'step': 1360, 'epoch': 2} {'type': 'loss', 'content': 0.03657596930861473, 'timestamp': '2025-09-15 03:20:18.314889', 'step': 1361, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.344739', 'step': 1361, 'epoch': 2} {'type': 'loss', 'content': 0.01493586041033268, 'timestamp': '2025-09-15 03:20:18.347044', 'step': 1362, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.376829', 'step': 1362, 'epoch': 2} {'type': 'loss', 'content': 0.005877330899238586, 'timestamp': '2025-09-15 03:20:18.378935', 'step': 1363, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:18.408795', 'step': 1363, 'epoch': 2} {'type': 'loss', 'content': 0.00430047744885087, 'timestamp': '2025-09-15 03:20:18.432458', 'step': 1364, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.479134', 'step': 1364, 'epoch': 2} {'type': 'loss', 'content': 0.015451534651219845, 'timestamp': '2025-09-15 03:20:18.481520', 'step': 1365, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:18.512230', 'step': 1365, 'epoch': 2} {'type': 'loss', 'content': 0.00832511205226183, 'timestamp': '2025-09-15 03:20:18.514429', 'step': 1366, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.546481', 'step': 1366, 'epoch': 2} {'type': 'loss', 'content': 0.012594794854521751, 'timestamp': '2025-09-15 03:20:18.548572', 'step': 1367, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:18.579123', 'step': 1367, 'epoch': 2} {'type': 'loss', 'content': 0.010724768042564392, 'timestamp': '2025-09-15 03:20:18.602799', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:19.313696', 'step': 1368, 'epoch': 2} {'type': 'pplx', 'content': 78061737.8925938, 'timestamp': '2025-09-15 03:20:19.315841', 'step': 1368, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.344404', 'step': 1368, 'epoch': 2} {'type': 'loss', 'content': 0.0019664892461150885, 'timestamp': '2025-09-15 03:20:19.346500', 'step': 1369, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.376283', 'step': 1369, 'epoch': 2} {'type': 'loss', 'content': 0.006835191044956446, 'timestamp': '2025-09-15 03:20:19.378404', 'step': 1370, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:19.408567', 'step': 1370, 'epoch': 2} {'type': 'loss', 'content': 0.007734772749245167, 'timestamp': '2025-09-15 03:20:19.411233', 'step': 1371, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:19.441162', 'step': 1371, 'epoch': 2} {'type': 'loss', 'content': 0.006751236040145159, 'timestamp': '2025-09-15 03:20:19.464757', 'step': 1372, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.494504', 'step': 1372, 'epoch': 2} {'type': 'loss', 'content': 0.013009068556129932, 'timestamp': '2025-09-15 03:20:19.496913', 'step': 1373, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.526972', 'step': 1373, 'epoch': 2} {'type': 'loss', 'content': 0.02262653224170208, 'timestamp': '2025-09-15 03:20:19.529051', 'step': 1374, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:19.559128', 'step': 1374, 'epoch': 2} {'type': 'loss', 'content': 0.02342197671532631, 'timestamp': '2025-09-15 03:20:19.561402', 'step': 1375, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.591457', 'step': 1375, 'epoch': 2} {'type': 'loss', 'content': 0.008266033604741096, 'timestamp': '2025-09-15 03:20:19.615078', 'step': 1376, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:19.645058', 'step': 1376, 'epoch': 2} {'type': 'loss', 'content': 0.0218669343739748, 'timestamp': '2025-09-15 03:20:19.647391', 'step': 1377, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.677539', 'step': 1377, 'epoch': 2} {'type': 'loss', 'content': 0.009586125612258911, 'timestamp': '2025-09-15 03:20:19.679962', 'step': 1378, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.710403', 'step': 1378, 'epoch': 2} {'type': 'loss', 'content': 0.01921851374208927, 'timestamp': '2025-09-15 03:20:19.712736', 'step': 1379, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.743105', 'step': 1379, 'epoch': 2} {'type': 'loss', 'content': 0.003876746166497469, 'timestamp': '2025-09-15 03:20:19.766546', 'step': 1380, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.796199', 'step': 1380, 'epoch': 2} {'type': 'loss', 'content': 0.02817346155643463, 'timestamp': '2025-09-15 03:20:19.798283', 'step': 1381, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.827830', 'step': 1381, 'epoch': 2} {'type': 'loss', 'content': 0.012659131549298763, 'timestamp': '2025-09-15 03:20:19.829820', 'step': 1382, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.859941', 'step': 1382, 'epoch': 2} {'type': 'loss', 'content': 0.011585528962314129, 'timestamp': '2025-09-15 03:20:19.861996', 'step': 1383, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.891722', 'step': 1383, 'epoch': 2} {'type': 'loss', 'content': 0.0018231567228212953, 'timestamp': '2025-09-15 03:20:19.915232', 'step': 1384, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.945415', 'step': 1384, 'epoch': 2} {'type': 'loss', 'content': 0.020531652495265007, 'timestamp': '2025-09-15 03:20:19.947551', 'step': 1385, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:19.976757', 'step': 1385, 'epoch': 2} {'type': 'loss', 'content': 0.016559377312660217, 'timestamp': '2025-09-15 03:20:19.978808', 'step': 1386, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.008324', 'step': 1386, 'epoch': 2} {'type': 'loss', 'content': 0.028371769934892654, 'timestamp': '2025-09-15 03:20:20.011300', 'step': 1387, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.040999', 'step': 1387, 'epoch': 2} {'type': 'loss', 'content': 0.0019516665488481522, 'timestamp': '2025-09-15 03:20:20.064504', 'step': 1388, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.094352', 'step': 1388, 'epoch': 2} {'type': 'loss', 'content': 0.01035268884152174, 'timestamp': '2025-09-15 03:20:20.096426', 'step': 1389, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:20.127438', 'step': 1389, 'epoch': 2} {'type': 'loss', 'content': 0.0055434927344322205, 'timestamp': '2025-09-15 03:20:20.129590', 'step': 1390, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:20.160074', 'step': 1390, 'epoch': 2} {'type': 'loss', 'content': 0.008157492615282536, 'timestamp': '2025-09-15 03:20:20.162591', 'step': 1391, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:20.192576', 'step': 1391, 'epoch': 2} {'type': 'loss', 'content': 0.006338280625641346, 'timestamp': '2025-09-15 03:20:20.216372', 'step': 1392, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.246324', 'step': 1392, 'epoch': 2} {'type': 'loss', 'content': 0.014807418920099735, 'timestamp': '2025-09-15 03:20:20.248643', 'step': 1393, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.278975', 'step': 1393, 'epoch': 2} {'type': 'loss', 'content': 0.014586917124688625, 'timestamp': '2025-09-15 03:20:20.280978', 'step': 1394, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.310495', 'step': 1394, 'epoch': 2} {'type': 'loss', 'content': 0.005534280091524124, 'timestamp': '2025-09-15 03:20:20.312633', 'step': 1395, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.343524', 'step': 1395, 'epoch': 2} {'type': 'loss', 'content': 0.008956083096563816, 'timestamp': '2025-09-15 03:20:20.367164', 'step': 1396, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.396889', 'step': 1396, 'epoch': 2} {'type': 'loss', 'content': 0.016665924340486526, 'timestamp': '2025-09-15 03:20:20.399309', 'step': 1397, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.430126', 'step': 1397, 'epoch': 2} {'type': 'loss', 'content': 0.03966280072927475, 'timestamp': '2025-09-15 03:20:20.431935', 'step': 1398, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.462178', 'step': 1398, 'epoch': 2} {'type': 'loss', 'content': 0.0075687081553041935, 'timestamp': '2025-09-15 03:20:20.464648', 'step': 1399, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:20.496864', 'step': 1399, 'epoch': 2} {'type': 'loss', 'content': 0.006370862480252981, 'timestamp': '2025-09-15 03:20:20.521766', 'step': 1400, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.553858', 'step': 1400, 'epoch': 2} {'type': 'loss', 'content': 0.005345536861568689, 'timestamp': '2025-09-15 03:20:20.566804', 'step': 1401, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:20.606600', 'step': 1401, 'epoch': 2} {'type': 'loss', 'content': 0.007125379052013159, 'timestamp': '2025-09-15 03:20:20.608755', 'step': 1402, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.646623', 'step': 1402, 'epoch': 2} {'type': 'loss', 'content': 0.007548754569143057, 'timestamp': '2025-09-15 03:20:20.648686', 'step': 1403, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:20.683741', 'step': 1403, 'epoch': 2} {'type': 'loss', 'content': 0.027229715138673782, 'timestamp': '2025-09-15 03:20:20.707863', 'step': 1404, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.742049', 'step': 1404, 'epoch': 2} {'type': 'loss', 'content': 0.014742210507392883, 'timestamp': '2025-09-15 03:20:20.744183', 'step': 1405, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.774779', 'step': 1405, 'epoch': 2} {'type': 'loss', 'content': 0.021080315113067627, 'timestamp': '2025-09-15 03:20:20.777052', 'step': 1406, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:20.807418', 'step': 1406, 'epoch': 2} {'type': 'loss', 'content': 0.0020817893091589212, 'timestamp': '2025-09-15 03:20:20.810531', 'step': 1407, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.840338', 'step': 1407, 'epoch': 2} {'type': 'loss', 'content': 0.017813878133893013, 'timestamp': '2025-09-15 03:20:20.863928', 'step': 1408, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:20.896500', 'step': 1408, 'epoch': 2} {'type': 'loss', 'content': 0.018892155960202217, 'timestamp': '2025-09-15 03:20:20.898656', 'step': 1409, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.929420', 'step': 1409, 'epoch': 2} {'type': 'loss', 'content': 0.0018157056765630841, 'timestamp': '2025-09-15 03:20:20.931340', 'step': 1410, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:20.960709', 'step': 1410, 'epoch': 2} {'type': 'loss', 'content': 0.01594122126698494, 'timestamp': '2025-09-15 03:20:20.962774', 'step': 1411, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.008829', 'step': 1411, 'epoch': 2} {'type': 'loss', 'content': 0.003443245543166995, 'timestamp': '2025-09-15 03:20:21.032477', 'step': 1412, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.062872', 'step': 1412, 'epoch': 2} {'type': 'loss', 'content': 0.0034602934028953314, 'timestamp': '2025-09-15 03:20:21.065110', 'step': 1413, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:21.095084', 'step': 1413, 'epoch': 2} {'type': 'loss', 'content': 0.0011383292730897665, 'timestamp': '2025-09-15 03:20:21.097240', 'step': 1414, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:21.131005', 'step': 1414, 'epoch': 2} {'type': 'loss', 'content': 0.015220055356621742, 'timestamp': '2025-09-15 03:20:21.133169', 'step': 1415, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.168698', 'step': 1415, 'epoch': 2} {'type': 'loss', 'content': 0.007584023289382458, 'timestamp': '2025-09-15 03:20:21.194194', 'step': 1416, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.223987', 'step': 1416, 'epoch': 2} {'type': 'loss', 'content': 0.001142369001172483, 'timestamp': '2025-09-15 03:20:21.226997', 'step': 1417, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.256503', 'step': 1417, 'epoch': 2} {'type': 'loss', 'content': 0.002171542961150408, 'timestamp': '2025-09-15 03:20:21.258623', 'step': 1418, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:21.289055', 'step': 1418, 'epoch': 2} {'type': 'loss', 'content': 0.003867406165227294, 'timestamp': '2025-09-15 03:20:21.291049', 'step': 1419, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.320831', 'step': 1419, 'epoch': 2} {'type': 'loss', 'content': 0.005698856431990862, 'timestamp': '2025-09-15 03:20:21.344290', 'step': 1420, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.374493', 'step': 1420, 'epoch': 2} {'type': 'loss', 'content': 0.00942178163677454, 'timestamp': '2025-09-15 03:20:21.376639', 'step': 1421, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.407065', 'step': 1421, 'epoch': 2} {'type': 'loss', 'content': 0.0010488334810361266, 'timestamp': '2025-09-15 03:20:21.409201', 'step': 1422, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:21.438697', 'step': 1422, 'epoch': 2} {'type': 'loss', 'content': 0.04225903004407883, 'timestamp': '2025-09-15 03:20:21.440742', 'step': 1423, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:21.470909', 'step': 1423, 'epoch': 2} {'type': 'loss', 'content': 0.023032980039715767, 'timestamp': '2025-09-15 03:20:21.494406', 'step': 1424, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:21.524524', 'step': 1424, 'epoch': 2} {'type': 'loss', 'content': 0.027132472023367882, 'timestamp': '2025-09-15 03:20:21.526603', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:22.242142', 'step': 1425, 'epoch': 2} {'type': 'pplx', 'content': 91832825.88411044, 'timestamp': '2025-09-15 03:20:22.244081', 'step': 1425, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.273362', 'step': 1425, 'epoch': 2} {'type': 'loss', 'content': 0.0015831494238227606, 'timestamp': '2025-09-15 03:20:22.275378', 'step': 1426, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.305335', 'step': 1426, 'epoch': 2} {'type': 'loss', 'content': 0.004346030298620462, 'timestamp': '2025-09-15 03:20:22.307413', 'step': 1427, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.337493', 'step': 1427, 'epoch': 2} {'type': 'loss', 'content': 0.03036068007349968, 'timestamp': '2025-09-15 03:20:22.362426', 'step': 1428, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:22.393852', 'step': 1428, 'epoch': 2} {'type': 'loss', 'content': 0.004273244645446539, 'timestamp': '2025-09-15 03:20:22.395949', 'step': 1429, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.426650', 'step': 1429, 'epoch': 2} {'type': 'loss', 'content': 0.005714810453355312, 'timestamp': '2025-09-15 03:20:22.428749', 'step': 1430, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.458910', 'step': 1430, 'epoch': 2} {'type': 'loss', 'content': 0.01159138698130846, 'timestamp': '2025-09-15 03:20:22.460831', 'step': 1431, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.492127', 'step': 1431, 'epoch': 2} {'type': 'loss', 'content': 0.007387206889688969, 'timestamp': '2025-09-15 03:20:22.515783', 'step': 1432, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:22.545814', 'step': 1432, 'epoch': 2} {'type': 'loss', 'content': 0.033014681190252304, 'timestamp': '2025-09-15 03:20:22.548005', 'step': 1433, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:22.577841', 'step': 1433, 'epoch': 2} {'type': 'loss', 'content': 0.015044102445244789, 'timestamp': '2025-09-15 03:20:22.580007', 'step': 1434, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.609580', 'step': 1434, 'epoch': 2} {'type': 'loss', 'content': 0.008882693946361542, 'timestamp': '2025-09-15 03:20:22.611593', 'step': 1435, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.641333', 'step': 1435, 'epoch': 2} {'type': 'loss', 'content': 0.003103381721302867, 'timestamp': '2025-09-15 03:20:22.665837', 'step': 1436, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:22.697401', 'step': 1436, 'epoch': 2} {'type': 'loss', 'content': 0.020440733060240746, 'timestamp': '2025-09-15 03:20:22.699629', 'step': 1437, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:22.731328', 'step': 1437, 'epoch': 2} {'type': 'loss', 'content': 0.007218391168862581, 'timestamp': '2025-09-15 03:20:22.734611', 'step': 1438, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.767966', 'step': 1438, 'epoch': 2} {'type': 'loss', 'content': 0.0050024488009512424, 'timestamp': '2025-09-15 03:20:22.770042', 'step': 1439, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.808501', 'step': 1439, 'epoch': 2} {'type': 'loss', 'content': 0.0015697681810706854, 'timestamp': '2025-09-15 03:20:22.832531', 'step': 1440, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:22.863319', 'step': 1440, 'epoch': 2} {'type': 'loss', 'content': 0.002475773449987173, 'timestamp': '2025-09-15 03:20:22.865304', 'step': 1441, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.895421', 'step': 1441, 'epoch': 2} {'type': 'loss', 'content': 0.002379846991971135, 'timestamp': '2025-09-15 03:20:22.897495', 'step': 1442, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:22.927810', 'step': 1442, 'epoch': 2} {'type': 'loss', 'content': 0.01347306091338396, 'timestamp': '2025-09-15 03:20:22.932120', 'step': 1443, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:22.964590', 'step': 1443, 'epoch': 2} {'type': 'loss', 'content': 0.0012131101684644818, 'timestamp': '2025-09-15 03:20:22.987998', 'step': 1444, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.017925', 'step': 1444, 'epoch': 2} {'type': 'loss', 'content': 0.0056666177697479725, 'timestamp': '2025-09-15 03:20:23.020669', 'step': 1445, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.051241', 'step': 1445, 'epoch': 2} {'type': 'loss', 'content': 0.006143883336335421, 'timestamp': '2025-09-15 03:20:23.053318', 'step': 1446, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.085681', 'step': 1446, 'epoch': 2} {'type': 'loss', 'content': 0.007649295497685671, 'timestamp': '2025-09-15 03:20:23.088064', 'step': 1447, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.118314', 'step': 1447, 'epoch': 2} {'type': 'loss', 'content': 0.007761591114103794, 'timestamp': '2025-09-15 03:20:23.144982', 'step': 1448, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.184242', 'step': 1448, 'epoch': 2} {'type': 'loss', 'content': 0.0191853828728199, 'timestamp': '2025-09-15 03:20:23.186363', 'step': 1449, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.216828', 'step': 1449, 'epoch': 2} {'type': 'loss', 'content': 0.0046918983571231365, 'timestamp': '2025-09-15 03:20:23.221198', 'step': 1450, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.253773', 'step': 1450, 'epoch': 2} {'type': 'loss', 'content': 0.013605318032205105, 'timestamp': '2025-09-15 03:20:23.259672', 'step': 1451, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.289779', 'step': 1451, 'epoch': 2} {'type': 'loss', 'content': 0.002763927448540926, 'timestamp': '2025-09-15 03:20:23.313174', 'step': 1452, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.342863', 'step': 1452, 'epoch': 2} {'type': 'loss', 'content': 0.0015863854205235839, 'timestamp': '2025-09-15 03:20:23.344942', 'step': 1453, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.374821', 'step': 1453, 'epoch': 2} {'type': 'loss', 'content': 0.013510088436305523, 'timestamp': '2025-09-15 03:20:23.377065', 'step': 1454, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.407701', 'step': 1454, 'epoch': 2} {'type': 'loss', 'content': 0.01751178875565529, 'timestamp': '2025-09-15 03:20:23.409709', 'step': 1455, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.439914', 'step': 1455, 'epoch': 2} {'type': 'loss', 'content': 0.019616883248090744, 'timestamp': '2025-09-15 03:20:23.463336', 'step': 1456, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.494310', 'step': 1456, 'epoch': 2} {'type': 'loss', 'content': 0.004575754515826702, 'timestamp': '2025-09-15 03:20:23.496146', 'step': 1457, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.525778', 'step': 1457, 'epoch': 2} {'type': 'loss', 'content': 0.019520703703165054, 'timestamp': '2025-09-15 03:20:23.527844', 'step': 1458, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.557502', 'step': 1458, 'epoch': 2} {'type': 'loss', 'content': 0.017982807010412216, 'timestamp': '2025-09-15 03:20:23.559498', 'step': 1459, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.589743', 'step': 1459, 'epoch': 2} {'type': 'loss', 'content': 0.0008443990955129266, 'timestamp': '2025-09-15 03:20:23.613144', 'step': 1460, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.643377', 'step': 1460, 'epoch': 2} {'type': 'loss', 'content': 0.05071615055203438, 'timestamp': '2025-09-15 03:20:23.645733', 'step': 1461, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.676146', 'step': 1461, 'epoch': 2} {'type': 'loss', 'content': 0.011467128060758114, 'timestamp': '2025-09-15 03:20:23.678560', 'step': 1462, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.709362', 'step': 1462, 'epoch': 2} {'type': 'loss', 'content': 0.022799739614129066, 'timestamp': '2025-09-15 03:20:23.711584', 'step': 1463, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.740840', 'step': 1463, 'epoch': 2} {'type': 'loss', 'content': 0.025861812755465508, 'timestamp': '2025-09-15 03:20:23.764363', 'step': 1464, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:23.794537', 'step': 1464, 'epoch': 2} {'type': 'loss', 'content': 0.008623288944363594, 'timestamp': '2025-09-15 03:20:23.796559', 'step': 1465, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.826650', 'step': 1465, 'epoch': 2} {'type': 'loss', 'content': 0.02509269118309021, 'timestamp': '2025-09-15 03:20:23.830355', 'step': 1466, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.861670', 'step': 1466, 'epoch': 2} {'type': 'loss', 'content': 0.03199907764792442, 'timestamp': '2025-09-15 03:20:23.863635', 'step': 1467, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.893808', 'step': 1467, 'epoch': 2} {'type': 'loss', 'content': 0.01597598008811474, 'timestamp': '2025-09-15 03:20:23.917308', 'step': 1468, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:23.948214', 'step': 1468, 'epoch': 2} {'type': 'loss', 'content': 0.013961074873805046, 'timestamp': '2025-09-15 03:20:23.950123', 'step': 1469, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:23.980236', 'step': 1469, 'epoch': 2} {'type': 'loss', 'content': 0.042097508907318115, 'timestamp': '2025-09-15 03:20:23.982335', 'step': 1470, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:24.012732', 'step': 1470, 'epoch': 2} {'type': 'loss', 'content': 0.018110528588294983, 'timestamp': '2025-09-15 03:20:24.014959', 'step': 1471, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:24.044851', 'step': 1471, 'epoch': 2} {'type': 'loss', 'content': 0.00701131671667099, 'timestamp': '2025-09-15 03:20:24.068196', 'step': 1472, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:24.098245', 'step': 1472, 'epoch': 2} {'type': 'loss', 'content': 0.006260588299483061, 'timestamp': '2025-09-15 03:20:24.099899', 'step': 1473, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:24.130081', 'step': 1473, 'epoch': 2} {'type': 'loss', 'content': 0.009581341408193111, 'timestamp': '2025-09-15 03:20:24.132175', 'step': 1474, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:24.162336', 'step': 1474, 'epoch': 2} {'type': 'loss', 'content': 0.013548241928219795, 'timestamp': '2025-09-15 03:20:24.164334', 'step': 1475, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:24.194673', 'step': 1475, 'epoch': 2} {'type': 'loss', 'content': 0.01859314739704132, 'timestamp': '2025-09-15 03:20:24.218311', 'step': 1476, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:24.248929', 'step': 1476, 'epoch': 2} {'type': 'loss', 'content': 0.002538888482376933, 'timestamp': '2025-09-15 03:20:24.251174', 'step': 1477, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:24.280996', 'step': 1477, 'epoch': 2} {'type': 'loss', 'content': 0.012346216477453709, 'timestamp': '2025-09-15 03:20:24.283084', 'step': 1478, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:24.313323', 'step': 1478, 'epoch': 2} {'type': 'loss', 'content': 0.013880938291549683, 'timestamp': '2025-09-15 03:20:24.315322', 'step': 1479, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:24.345084', 'step': 1479, 'epoch': 2} {'type': 'loss', 'content': 0.00909118540585041, 'timestamp': '2025-09-15 03:20:24.368558', 'step': 1480, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:24.398625', 'step': 1480, 'epoch': 2} {'type': 'loss', 'content': 0.015353651717305183, 'timestamp': '2025-09-15 03:20:24.400759', 'step': 1481, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:24.431330', 'step': 1481, 'epoch': 2} {'type': 'loss', 'content': 0.01462968997657299, 'timestamp': '2025-09-15 03:20:24.433473', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:25.140066', 'step': 1482, 'epoch': 2} {'type': 'pplx', 'content': 94876692.11559597, 'timestamp': '2025-09-15 03:20:25.142058', 'step': 1482, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.171504', 'step': 1482, 'epoch': 2} {'type': 'loss', 'content': 0.024286191910505295, 'timestamp': '2025-09-15 03:20:25.173551', 'step': 1483, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.204743', 'step': 1483, 'epoch': 2} {'type': 'loss', 'content': 0.0050559681840240955, 'timestamp': '2025-09-15 03:20:25.228242', 'step': 1484, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:25.258220', 'step': 1484, 'epoch': 2} {'type': 'loss', 'content': 0.02460642158985138, 'timestamp': '2025-09-15 03:20:25.260141', 'step': 1485, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.290321', 'step': 1485, 'epoch': 2} {'type': 'loss', 'content': 0.013014466501772404, 'timestamp': '2025-09-15 03:20:25.292458', 'step': 1486, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.321826', 'step': 1486, 'epoch': 2} {'type': 'loss', 'content': 0.004653667565435171, 'timestamp': '2025-09-15 03:20:25.323964', 'step': 1487, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.355568', 'step': 1487, 'epoch': 2} {'type': 'loss', 'content': 0.006579217966645956, 'timestamp': '2025-09-15 03:20:25.379293', 'step': 1488, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.409125', 'step': 1488, 'epoch': 2} {'type': 'loss', 'content': 0.01029092725366354, 'timestamp': '2025-09-15 03:20:25.411476', 'step': 1489, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:25.443252', 'step': 1489, 'epoch': 2} {'type': 'loss', 'content': 0.017754560336470604, 'timestamp': '2025-09-15 03:20:25.445379', 'step': 1490, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.475341', 'step': 1490, 'epoch': 2} {'type': 'loss', 'content': 0.014583499170839787, 'timestamp': '2025-09-15 03:20:25.477270', 'step': 1491, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.507864', 'step': 1491, 'epoch': 2} {'type': 'loss', 'content': 0.010781803168356419, 'timestamp': '2025-09-15 03:20:25.531483', 'step': 1492, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.561278', 'step': 1492, 'epoch': 2} {'type': 'loss', 'content': 0.006159218959510326, 'timestamp': '2025-09-15 03:20:25.563663', 'step': 1493, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:25.595076', 'step': 1493, 'epoch': 2} {'type': 'loss', 'content': 0.013684733770787716, 'timestamp': '2025-09-15 03:20:25.597079', 'step': 1494, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:25.626828', 'step': 1494, 'epoch': 2} {'type': 'loss', 'content': 0.041964296251535416, 'timestamp': '2025-09-15 03:20:25.629397', 'step': 1495, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.659260', 'step': 1495, 'epoch': 2} {'type': 'loss', 'content': 0.0167537909001112, 'timestamp': '2025-09-15 03:20:25.682683', 'step': 1496, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.712965', 'step': 1496, 'epoch': 2} {'type': 'loss', 'content': 0.005579576827585697, 'timestamp': '2025-09-15 03:20:25.715181', 'step': 1497, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:25.745062', 'step': 1497, 'epoch': 2} {'type': 'loss', 'content': 0.007810843177139759, 'timestamp': '2025-09-15 03:20:25.747280', 'step': 1498, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:25.777415', 'step': 1498, 'epoch': 2} {'type': 'loss', 'content': 0.011461307294666767, 'timestamp': '2025-09-15 03:20:25.779512', 'step': 1499, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:25.809270', 'step': 1499, 'epoch': 2} {'type': 'loss', 'content': 0.019440533593297005, 'timestamp': '2025-09-15 03:20:25.833162', 'step': 1500, 'epoch': 2} {'type': 'info', 'content': 'Checkpoint saved at step 1500', 'timestamp': '2025-09-15 03:20:32.277364', 'step': 1500, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:32.321382', 'step': 1500, 'epoch': 2} {'type': 'loss', 'content': 0.00861346535384655, 'timestamp': '2025-09-15 03:20:32.323847', 'step': 1501, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.355255', 'step': 1501, 'epoch': 2} {'type': 'loss', 'content': 0.00922238826751709, 'timestamp': '2025-09-15 03:20:32.357272', 'step': 1502, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.387424', 'step': 1502, 'epoch': 2} {'type': 'loss', 'content': 0.004329313989728689, 'timestamp': '2025-09-15 03:20:32.389466', 'step': 1503, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.420808', 'step': 1503, 'epoch': 2} {'type': 'loss', 'content': 0.008352418430149555, 'timestamp': '2025-09-15 03:20:32.444039', 'step': 1504, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:32.474059', 'step': 1504, 'epoch': 2} {'type': 'loss', 'content': 0.02553441934287548, 'timestamp': '2025-09-15 03:20:32.476264', 'step': 1505, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.506486', 'step': 1505, 'epoch': 2} {'type': 'loss', 'content': 0.027252767235040665, 'timestamp': '2025-09-15 03:20:32.508477', 'step': 1506, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.538637', 'step': 1506, 'epoch': 2} {'type': 'loss', 'content': 0.030122382566332817, 'timestamp': '2025-09-15 03:20:32.540479', 'step': 1507, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:20:32.571620', 'step': 1507, 'epoch': 2} {'type': 'loss', 'content': 0.017167910933494568, 'timestamp': '2025-09-15 03:20:32.596530', 'step': 1508, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.626498', 'step': 1508, 'epoch': 2} {'type': 'loss', 'content': 0.008571980521082878, 'timestamp': '2025-09-15 03:20:32.628596', 'step': 1509, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.658742', 'step': 1509, 'epoch': 2} {'type': 'loss', 'content': 0.018993262201547623, 'timestamp': '2025-09-15 03:20:32.660773', 'step': 1510, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:32.691130', 'step': 1510, 'epoch': 2} {'type': 'loss', 'content': 0.0016351805534213781, 'timestamp': '2025-09-15 03:20:32.693101', 'step': 1511, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.723715', 'step': 1511, 'epoch': 2} {'type': 'loss', 'content': 0.013522188179194927, 'timestamp': '2025-09-15 03:20:32.747448', 'step': 1512, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.777603', 'step': 1512, 'epoch': 2} {'type': 'loss', 'content': 0.01635715551674366, 'timestamp': '2025-09-15 03:20:32.779678', 'step': 1513, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:32.810168', 'step': 1513, 'epoch': 2} {'type': 'loss', 'content': 0.018260132521390915, 'timestamp': '2025-09-15 03:20:32.812231', 'step': 1514, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.842217', 'step': 1514, 'epoch': 2} {'type': 'loss', 'content': 0.008778615854680538, 'timestamp': '2025-09-15 03:20:32.844246', 'step': 1515, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:32.874101', 'step': 1515, 'epoch': 2} {'type': 'loss', 'content': 0.04932339861989021, 'timestamp': '2025-09-15 03:20:32.897674', 'step': 1516, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.927801', 'step': 1516, 'epoch': 2} {'type': 'loss', 'content': 0.00995201151818037, 'timestamp': '2025-09-15 03:20:32.929887', 'step': 1517, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.960313', 'step': 1517, 'epoch': 2} {'type': 'loss', 'content': 0.02796383574604988, 'timestamp': '2025-09-15 03:20:32.962499', 'step': 1518, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:32.992497', 'step': 1518, 'epoch': 2} {'type': 'loss', 'content': 0.015218219719827175, 'timestamp': '2025-09-15 03:20:32.999792', 'step': 1519, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.032125', 'step': 1519, 'epoch': 2} {'type': 'loss', 'content': 0.0046564312651753426, 'timestamp': '2025-09-15 03:20:33.055587', 'step': 1520, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.085373', 'step': 1520, 'epoch': 2} {'type': 'loss', 'content': 0.015503061935305595, 'timestamp': '2025-09-15 03:20:33.087396', 'step': 1521, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.118251', 'step': 1521, 'epoch': 2} {'type': 'loss', 'content': 0.007008845917880535, 'timestamp': '2025-09-15 03:20:33.120476', 'step': 1522, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:33.150485', 'step': 1522, 'epoch': 2} {'type': 'loss', 'content': 0.01042736042290926, 'timestamp': '2025-09-15 03:20:33.152513', 'step': 1523, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.182214', 'step': 1523, 'epoch': 2} {'type': 'loss', 'content': 0.01703822799026966, 'timestamp': '2025-09-15 03:20:33.205770', 'step': 1524, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:33.235781', 'step': 1524, 'epoch': 2} {'type': 'loss', 'content': 0.015462229959666729, 'timestamp': '2025-09-15 03:20:33.237778', 'step': 1525, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.269916', 'step': 1525, 'epoch': 2} {'type': 'loss', 'content': 0.012268836610019207, 'timestamp': '2025-09-15 03:20:33.271964', 'step': 1526, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:33.302214', 'step': 1526, 'epoch': 2} {'type': 'loss', 'content': 0.012304599396884441, 'timestamp': '2025-09-15 03:20:33.304579', 'step': 1527, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.334831', 'step': 1527, 'epoch': 2} {'type': 'loss', 'content': 0.03490256145596504, 'timestamp': '2025-09-15 03:20:33.358375', 'step': 1528, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.389082', 'step': 1528, 'epoch': 2} {'type': 'loss', 'content': 0.028470784425735474, 'timestamp': '2025-09-15 03:20:33.391015', 'step': 1529, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.421119', 'step': 1529, 'epoch': 2} {'type': 'loss', 'content': 0.02890247479081154, 'timestamp': '2025-09-15 03:20:33.423079', 'step': 1530, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.453882', 'step': 1530, 'epoch': 2} {'type': 'loss', 'content': 0.007559936959296465, 'timestamp': '2025-09-15 03:20:33.456034', 'step': 1531, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.485878', 'step': 1531, 'epoch': 2} {'type': 'loss', 'content': 0.02463674731552601, 'timestamp': '2025-09-15 03:20:33.509906', 'step': 1532, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.539500', 'step': 1532, 'epoch': 2} {'type': 'loss', 'content': 0.016762617975473404, 'timestamp': '2025-09-15 03:20:33.541563', 'step': 1533, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:33.572423', 'step': 1533, 'epoch': 2} {'type': 'loss', 'content': 0.028797050938010216, 'timestamp': '2025-09-15 03:20:33.574442', 'step': 1534, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.604887', 'step': 1534, 'epoch': 2} {'type': 'loss', 'content': 0.02649320848286152, 'timestamp': '2025-09-15 03:20:33.607124', 'step': 1535, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.637901', 'step': 1535, 'epoch': 2} {'type': 'loss', 'content': 0.014305144548416138, 'timestamp': '2025-09-15 03:20:33.661244', 'step': 1536, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:33.691317', 'step': 1536, 'epoch': 2} {'type': 'loss', 'content': 0.007026375271379948, 'timestamp': '2025-09-15 03:20:33.693222', 'step': 1537, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:33.723287', 'step': 1537, 'epoch': 2} {'type': 'loss', 'content': 0.03457867354154587, 'timestamp': '2025-09-15 03:20:33.725545', 'step': 1538, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:33.755631', 'step': 1538, 'epoch': 2} {'type': 'loss', 'content': 0.004672153387218714, 'timestamp': '2025-09-15 03:20:33.757686', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:34.478807', 'step': 1539, 'epoch': 2} {'type': 'pplx', 'content': 87727919.03047407, 'timestamp': '2025-09-15 03:20:34.480626', 'step': 1539, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.509237', 'step': 1539, 'epoch': 2} {'type': 'loss', 'content': 0.014389991760253906, 'timestamp': '2025-09-15 03:20:34.532819', 'step': 1540, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.562765', 'step': 1540, 'epoch': 2} {'type': 'loss', 'content': 0.03513328358530998, 'timestamp': '2025-09-15 03:20:34.564976', 'step': 1541, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.594929', 'step': 1541, 'epoch': 2} {'type': 'loss', 'content': 0.014334970153868198, 'timestamp': '2025-09-15 03:20:34.596912', 'step': 1542, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.627022', 'step': 1542, 'epoch': 2} {'type': 'loss', 'content': 0.0023369737900793552, 'timestamp': '2025-09-15 03:20:34.629179', 'step': 1543, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.661102', 'step': 1543, 'epoch': 2} {'type': 'loss', 'content': 0.0029845726676285267, 'timestamp': '2025-09-15 03:20:34.684666', 'step': 1544, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.714562', 'step': 1544, 'epoch': 2} {'type': 'loss', 'content': 0.006009969394654036, 'timestamp': '2025-09-15 03:20:34.716749', 'step': 1545, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.746941', 'step': 1545, 'epoch': 2} {'type': 'loss', 'content': 0.045693524181842804, 'timestamp': '2025-09-15 03:20:34.748985', 'step': 1546, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.777963', 'step': 1546, 'epoch': 2} {'type': 'loss', 'content': 0.04461913928389549, 'timestamp': '2025-09-15 03:20:34.779988', 'step': 1547, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.809877', 'step': 1547, 'epoch': 2} {'type': 'loss', 'content': 0.005639377050101757, 'timestamp': '2025-09-15 03:20:34.833176', 'step': 1548, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:34.863635', 'step': 1548, 'epoch': 2} {'type': 'loss', 'content': 0.0031557695474475622, 'timestamp': '2025-09-15 03:20:34.865557', 'step': 1549, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.895729', 'step': 1549, 'epoch': 2} {'type': 'loss', 'content': 0.0018140083411708474, 'timestamp': '2025-09-15 03:20:34.898667', 'step': 1550, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:34.928439', 'step': 1550, 'epoch': 2} {'type': 'loss', 'content': 0.009812816977500916, 'timestamp': '2025-09-15 03:20:34.930575', 'step': 1551, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:34.960557', 'step': 1551, 'epoch': 2} {'type': 'loss', 'content': 0.01942797377705574, 'timestamp': '2025-09-15 03:20:34.984241', 'step': 1552, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.014609', 'step': 1552, 'epoch': 2} {'type': 'loss', 'content': 0.007379674352705479, 'timestamp': '2025-09-15 03:20:35.016594', 'step': 1553, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.046691', 'step': 1553, 'epoch': 2} {'type': 'loss', 'content': 0.0004087972338311374, 'timestamp': '2025-09-15 03:20:35.048855', 'step': 1554, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.078891', 'step': 1554, 'epoch': 2} {'type': 'loss', 'content': 0.038724932819604874, 'timestamp': '2025-09-15 03:20:35.080968', 'step': 1555, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.110396', 'step': 1555, 'epoch': 2} {'type': 'loss', 'content': 0.024620456621050835, 'timestamp': '2025-09-15 03:20:35.133766', 'step': 1556, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.164411', 'step': 1556, 'epoch': 2} {'type': 'loss', 'content': 0.0014810477150604129, 'timestamp': '2025-09-15 03:20:35.166483', 'step': 1557, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.196324', 'step': 1557, 'epoch': 2} {'type': 'loss', 'content': 0.005852595437318087, 'timestamp': '2025-09-15 03:20:35.198654', 'step': 1558, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.229262', 'step': 1558, 'epoch': 2} {'type': 'loss', 'content': 0.008139959536492825, 'timestamp': '2025-09-15 03:20:35.231397', 'step': 1559, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.261123', 'step': 1559, 'epoch': 2} {'type': 'loss', 'content': 0.002672811970114708, 'timestamp': '2025-09-15 03:20:35.284481', 'step': 1560, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.314509', 'step': 1560, 'epoch': 2} {'type': 'loss', 'content': 0.02692270092666149, 'timestamp': '2025-09-15 03:20:35.316712', 'step': 1561, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.346736', 'step': 1561, 'epoch': 2} {'type': 'loss', 'content': 0.004136955831199884, 'timestamp': '2025-09-15 03:20:35.348698', 'step': 1562, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.378193', 'step': 1562, 'epoch': 2} {'type': 'loss', 'content': 0.021433161571621895, 'timestamp': '2025-09-15 03:20:35.380290', 'step': 1563, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.409743', 'step': 1563, 'epoch': 2} {'type': 'loss', 'content': 0.00741331884637475, 'timestamp': '2025-09-15 03:20:35.433291', 'step': 1564, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.464055', 'step': 1564, 'epoch': 2} {'type': 'loss', 'content': 0.0038252919912338257, 'timestamp': '2025-09-15 03:20:35.466176', 'step': 1565, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:35.496187', 'step': 1565, 'epoch': 2} {'type': 'loss', 'content': 0.0054579381830990314, 'timestamp': '2025-09-15 03:20:35.498182', 'step': 1566, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:35.527562', 'step': 1566, 'epoch': 2} {'type': 'loss', 'content': 0.0015629673143848777, 'timestamp': '2025-09-15 03:20:35.529903', 'step': 1567, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.560546', 'step': 1567, 'epoch': 2} {'type': 'loss', 'content': 0.020011266693472862, 'timestamp': '2025-09-15 03:20:35.584009', 'step': 1568, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.613816', 'step': 1568, 'epoch': 2} {'type': 'loss', 'content': 0.005876597017049789, 'timestamp': '2025-09-15 03:20:35.615820', 'step': 1569, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.645785', 'step': 1569, 'epoch': 2} {'type': 'loss', 'content': 0.013301116414368153, 'timestamp': '2025-09-15 03:20:35.648118', 'step': 1570, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.677829', 'step': 1570, 'epoch': 2} {'type': 'loss', 'content': 0.015047280117869377, 'timestamp': '2025-09-15 03:20:35.679844', 'step': 1571, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.709852', 'step': 1571, 'epoch': 2} {'type': 'loss', 'content': 0.005345091689378023, 'timestamp': '2025-09-15 03:20:35.733408', 'step': 1572, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.764632', 'step': 1572, 'epoch': 2} {'type': 'loss', 'content': 0.003994218073785305, 'timestamp': '2025-09-15 03:20:35.766847', 'step': 1573, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.796720', 'step': 1573, 'epoch': 2} {'type': 'loss', 'content': 0.0005644602351821959, 'timestamp': '2025-09-15 03:20:35.798752', 'step': 1574, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.829468', 'step': 1574, 'epoch': 2} {'type': 'loss', 'content': 0.016421610489487648, 'timestamp': '2025-09-15 03:20:35.831600', 'step': 1575, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:35.862268', 'step': 1575, 'epoch': 2} {'type': 'loss', 'content': 0.0015365204308182001, 'timestamp': '2025-09-15 03:20:35.885744', 'step': 1576, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.917981', 'step': 1576, 'epoch': 2} {'type': 'loss', 'content': 0.007548271678388119, 'timestamp': '2025-09-15 03:20:35.922096', 'step': 1577, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.953143', 'step': 1577, 'epoch': 2} {'type': 'loss', 'content': 0.020572826266288757, 'timestamp': '2025-09-15 03:20:35.955416', 'step': 1578, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:35.985754', 'step': 1578, 'epoch': 2} {'type': 'loss', 'content': 0.0013158052461221814, 'timestamp': '2025-09-15 03:20:35.987871', 'step': 1579, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.018117', 'step': 1579, 'epoch': 2} {'type': 'loss', 'content': 0.010661915875971317, 'timestamp': '2025-09-15 03:20:36.041489', 'step': 1580, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:36.073728', 'step': 1580, 'epoch': 2} {'type': 'loss', 'content': 0.01249182689934969, 'timestamp': '2025-09-15 03:20:36.075661', 'step': 1581, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.105197', 'step': 1581, 'epoch': 2} {'type': 'loss', 'content': 0.0041304114274680614, 'timestamp': '2025-09-15 03:20:36.107374', 'step': 1582, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.137254', 'step': 1582, 'epoch': 2} {'type': 'loss', 'content': 0.008346246555447578, 'timestamp': '2025-09-15 03:20:36.139348', 'step': 1583, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.170728', 'step': 1583, 'epoch': 2} {'type': 'loss', 'content': 0.0073064700700342655, 'timestamp': '2025-09-15 03:20:36.194207', 'step': 1584, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.224480', 'step': 1584, 'epoch': 2} {'type': 'loss', 'content': 0.009679530747234821, 'timestamp': '2025-09-15 03:20:36.226678', 'step': 1585, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.256676', 'step': 1585, 'epoch': 2} {'type': 'loss', 'content': 0.023354899138212204, 'timestamp': '2025-09-15 03:20:36.258595', 'step': 1586, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:36.288494', 'step': 1586, 'epoch': 2} {'type': 'loss', 'content': 0.04176362231373787, 'timestamp': '2025-09-15 03:20:36.290651', 'step': 1587, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.320296', 'step': 1587, 'epoch': 2} {'type': 'loss', 'content': 0.0026484071277081966, 'timestamp': '2025-09-15 03:20:36.343661', 'step': 1588, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.374183', 'step': 1588, 'epoch': 2} {'type': 'loss', 'content': 0.01016245223581791, 'timestamp': '2025-09-15 03:20:36.376232', 'step': 1589, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.406830', 'step': 1589, 'epoch': 2} {'type': 'loss', 'content': 0.016892336308956146, 'timestamp': '2025-09-15 03:20:36.408872', 'step': 1590, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.438869', 'step': 1590, 'epoch': 2} {'type': 'loss', 'content': 0.03908044844865799, 'timestamp': '2025-09-15 03:20:36.441365', 'step': 1591, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.471395', 'step': 1591, 'epoch': 2} {'type': 'loss', 'content': 0.059734683483839035, 'timestamp': '2025-09-15 03:20:36.494739', 'step': 1592, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.524810', 'step': 1592, 'epoch': 2} {'type': 'loss', 'content': 0.008856425061821938, 'timestamp': '2025-09-15 03:20:36.526888', 'step': 1593, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.556640', 'step': 1593, 'epoch': 2} {'type': 'loss', 'content': 0.008398907259106636, 'timestamp': '2025-09-15 03:20:36.558489', 'step': 1594, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:36.589197', 'step': 1594, 'epoch': 2} {'type': 'loss', 'content': 0.015620146878063679, 'timestamp': '2025-09-15 03:20:36.591171', 'step': 1595, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:36.620925', 'step': 1595, 'epoch': 2} {'type': 'loss', 'content': 0.01876661740243435, 'timestamp': '2025-09-15 03:20:36.644417', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:37.356740', 'step': 1596, 'epoch': 2} {'type': 'pplx', 'content': 75713161.11239102, 'timestamp': '2025-09-15 03:20:37.358839', 'step': 1596, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.386785', 'step': 1596, 'epoch': 2} {'type': 'loss', 'content': 0.005393002647906542, 'timestamp': '2025-09-15 03:20:37.388893', 'step': 1597, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:37.418998', 'step': 1597, 'epoch': 2} {'type': 'loss', 'content': 0.00764166796579957, 'timestamp': '2025-09-15 03:20:37.422029', 'step': 1598, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:37.452195', 'step': 1598, 'epoch': 2} {'type': 'loss', 'content': 0.01738116517663002, 'timestamp': '2025-09-15 03:20:37.456104', 'step': 1599, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.487884', 'step': 1599, 'epoch': 2} {'type': 'loss', 'content': 0.010397247970104218, 'timestamp': '2025-09-15 03:20:37.511360', 'step': 1600, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.541090', 'step': 1600, 'epoch': 2} {'type': 'loss', 'content': 0.0032066632993519306, 'timestamp': '2025-09-15 03:20:37.543093', 'step': 1601, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.573301', 'step': 1601, 'epoch': 2} {'type': 'loss', 'content': 0.031752921640872955, 'timestamp': '2025-09-15 03:20:37.575458', 'step': 1602, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.605808', 'step': 1602, 'epoch': 2} {'type': 'loss', 'content': 0.018547099083662033, 'timestamp': '2025-09-15 03:20:37.607971', 'step': 1603, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.638177', 'step': 1603, 'epoch': 2} {'type': 'loss', 'content': 0.0020383193623274565, 'timestamp': '2025-09-15 03:20:37.661681', 'step': 1604, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.691809', 'step': 1604, 'epoch': 2} {'type': 'loss', 'content': 0.005079983733594418, 'timestamp': '2025-09-15 03:20:37.693892', 'step': 1605, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.723693', 'step': 1605, 'epoch': 2} {'type': 'loss', 'content': 0.03111599199473858, 'timestamp': '2025-09-15 03:20:37.725664', 'step': 1606, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.755590', 'step': 1606, 'epoch': 2} {'type': 'loss', 'content': 0.00759871955960989, 'timestamp': '2025-09-15 03:20:37.757792', 'step': 1607, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.788562', 'step': 1607, 'epoch': 2} {'type': 'loss', 'content': 0.00797184742987156, 'timestamp': '2025-09-15 03:20:37.811982', 'step': 1608, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.841919', 'step': 1608, 'epoch': 2} {'type': 'loss', 'content': 0.006208633538335562, 'timestamp': '2025-09-15 03:20:37.843901', 'step': 1609, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.874674', 'step': 1609, 'epoch': 2} {'type': 'loss', 'content': 0.026710880920290947, 'timestamp': '2025-09-15 03:20:37.876830', 'step': 1610, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:37.906920', 'step': 1610, 'epoch': 2} {'type': 'loss', 'content': 0.036108966916799545, 'timestamp': '2025-09-15 03:20:37.909005', 'step': 1611, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:37.939214', 'step': 1611, 'epoch': 2} {'type': 'loss', 'content': 0.0037150667048990726, 'timestamp': '2025-09-15 03:20:37.962677', 'step': 1612, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:37.993605', 'step': 1612, 'epoch': 2} {'type': 'loss', 'content': 0.013364890590310097, 'timestamp': '2025-09-15 03:20:37.996141', 'step': 1613, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:38.027672', 'step': 1613, 'epoch': 2} {'type': 'loss', 'content': 0.007008062209933996, 'timestamp': '2025-09-15 03:20:38.029877', 'step': 1614, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:38.059989', 'step': 1614, 'epoch': 2} {'type': 'loss', 'content': 0.0037146012764424086, 'timestamp': '2025-09-15 03:20:38.062242', 'step': 1615, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:38.092622', 'step': 1615, 'epoch': 2} {'type': 'loss', 'content': 0.01745392195880413, 'timestamp': '2025-09-15 03:20:38.116174', 'step': 1616, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:38.146420', 'step': 1616, 'epoch': 2} {'type': 'loss', 'content': 0.01895240880548954, 'timestamp': '2025-09-15 03:20:38.148505', 'step': 1617, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:38.179417', 'step': 1617, 'epoch': 2} {'type': 'loss', 'content': 0.0034704965073615313, 'timestamp': '2025-09-15 03:20:38.181572', 'step': 1618, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.211278', 'step': 1618, 'epoch': 2} {'type': 'loss', 'content': 0.009162304922938347, 'timestamp': '2025-09-15 03:20:38.213647', 'step': 1619, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.244105', 'step': 1619, 'epoch': 2} {'type': 'loss', 'content': 0.017583999782800674, 'timestamp': '2025-09-15 03:20:38.267465', 'step': 1620, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:38.297701', 'step': 1620, 'epoch': 2} {'type': 'loss', 'content': 0.007137224078178406, 'timestamp': '2025-09-15 03:20:38.299808', 'step': 1621, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:38.329946', 'step': 1621, 'epoch': 2} {'type': 'loss', 'content': 0.01079423725605011, 'timestamp': '2025-09-15 03:20:38.332273', 'step': 1622, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:38.362321', 'step': 1622, 'epoch': 2} {'type': 'loss', 'content': 0.001529446104541421, 'timestamp': '2025-09-15 03:20:38.364410', 'step': 1623, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.395059', 'step': 1623, 'epoch': 2} {'type': 'loss', 'content': 0.0021572206169366837, 'timestamp': '2025-09-15 03:20:38.418473', 'step': 1624, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.448592', 'step': 1624, 'epoch': 2} {'type': 'loss', 'content': 0.010212992317974567, 'timestamp': '2025-09-15 03:20:38.450798', 'step': 1625, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.480604', 'step': 1625, 'epoch': 2} {'type': 'loss', 'content': 0.00565464049577713, 'timestamp': '2025-09-15 03:20:38.482563', 'step': 1626, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.512439', 'step': 1626, 'epoch': 2} {'type': 'loss', 'content': 0.048854947090148926, 'timestamp': '2025-09-15 03:20:38.514561', 'step': 1627, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:38.544278', 'step': 1627, 'epoch': 2} {'type': 'loss', 'content': 0.008677697740495205, 'timestamp': '2025-09-15 03:20:38.568169', 'step': 1628, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.604878', 'step': 1628, 'epoch': 2} {'type': 'loss', 'content': 0.017185676842927933, 'timestamp': '2025-09-15 03:20:38.606911', 'step': 1629, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.642142', 'step': 1629, 'epoch': 2} {'type': 'loss', 'content': 0.00775103410705924, 'timestamp': '2025-09-15 03:20:38.644605', 'step': 1630, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.677289', 'step': 1630, 'epoch': 2} {'type': 'loss', 'content': 0.008807477541267872, 'timestamp': '2025-09-15 03:20:38.679421', 'step': 1631, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:38.709461', 'step': 1631, 'epoch': 2} {'type': 'loss', 'content': 0.029624175280332565, 'timestamp': '2025-09-15 03:20:38.732945', 'step': 1632, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.763295', 'step': 1632, 'epoch': 2} {'type': 'loss', 'content': 0.017720164731144905, 'timestamp': '2025-09-15 03:20:38.765594', 'step': 1633, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.795122', 'step': 1633, 'epoch': 2} {'type': 'loss', 'content': 0.006194696761667728, 'timestamp': '2025-09-15 03:20:38.797302', 'step': 1634, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.826968', 'step': 1634, 'epoch': 2} {'type': 'loss', 'content': 0.010376783087849617, 'timestamp': '2025-09-15 03:20:38.829059', 'step': 1635, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.859224', 'step': 1635, 'epoch': 2} {'type': 'loss', 'content': 0.008767379447817802, 'timestamp': '2025-09-15 03:20:38.888963', 'step': 1636, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:38.919455', 'step': 1636, 'epoch': 2} {'type': 'loss', 'content': 0.020663250237703323, 'timestamp': '2025-09-15 03:20:38.921569', 'step': 1637, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.951323', 'step': 1637, 'epoch': 2} {'type': 'loss', 'content': 0.0013582947431132197, 'timestamp': '2025-09-15 03:20:38.953791', 'step': 1638, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:38.984311', 'step': 1638, 'epoch': 2} {'type': 'loss', 'content': 0.002349320100620389, 'timestamp': '2025-09-15 03:20:38.986506', 'step': 1639, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.016196', 'step': 1639, 'epoch': 2} {'type': 'loss', 'content': 0.004257179331034422, 'timestamp': '2025-09-15 03:20:39.039577', 'step': 1640, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.069141', 'step': 1640, 'epoch': 2} {'type': 'loss', 'content': 0.02164802886545658, 'timestamp': '2025-09-15 03:20:39.071310', 'step': 1641, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.101676', 'step': 1641, 'epoch': 2} {'type': 'loss', 'content': 0.018588954582810402, 'timestamp': '2025-09-15 03:20:39.104122', 'step': 1642, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:39.134863', 'step': 1642, 'epoch': 2} {'type': 'loss', 'content': 0.007363787852227688, 'timestamp': '2025-09-15 03:20:39.137090', 'step': 1643, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:39.167079', 'step': 1643, 'epoch': 2} {'type': 'loss', 'content': 0.012514977715909481, 'timestamp': '2025-09-15 03:20:39.190666', 'step': 1644, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.220712', 'step': 1644, 'epoch': 2} {'type': 'loss', 'content': 0.008080360479652882, 'timestamp': '2025-09-15 03:20:39.222765', 'step': 1645, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.253110', 'step': 1645, 'epoch': 2} {'type': 'loss', 'content': 0.007632083725184202, 'timestamp': '2025-09-15 03:20:39.255766', 'step': 1646, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.285442', 'step': 1646, 'epoch': 2} {'type': 'loss', 'content': 0.007638550829142332, 'timestamp': '2025-09-15 03:20:39.287989', 'step': 1647, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.318621', 'step': 1647, 'epoch': 2} {'type': 'loss', 'content': 0.009446695446968079, 'timestamp': '2025-09-15 03:20:39.342132', 'step': 1648, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.372016', 'step': 1648, 'epoch': 2} {'type': 'loss', 'content': 0.018892496824264526, 'timestamp': '2025-09-15 03:20:39.374197', 'step': 1649, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:39.405886', 'step': 1649, 'epoch': 2} {'type': 'loss', 'content': 0.007213030010461807, 'timestamp': '2025-09-15 03:20:39.408188', 'step': 1650, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:39.438502', 'step': 1650, 'epoch': 2} {'type': 'loss', 'content': 0.0050399452447891235, 'timestamp': '2025-09-15 03:20:39.440600', 'step': 1651, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:39.470452', 'step': 1651, 'epoch': 2} {'type': 'loss', 'content': 0.0027350725140422583, 'timestamp': '2025-09-15 03:20:39.493928', 'step': 1652, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:39.523699', 'step': 1652, 'epoch': 2} {'type': 'loss', 'content': 0.017289655283093452, 'timestamp': '2025-09-15 03:20:39.525776', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:40.234928', 'step': 1653, 'epoch': 2} {'type': 'pplx', 'content': 78568448.51048347, 'timestamp': '2025-09-15 03:20:40.237391', 'step': 1653, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.265970', 'step': 1653, 'epoch': 2} {'type': 'loss', 'content': 0.010751119814813137, 'timestamp': '2025-09-15 03:20:40.268607', 'step': 1654, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.298419', 'step': 1654, 'epoch': 2} {'type': 'loss', 'content': 0.006664137355983257, 'timestamp': '2025-09-15 03:20:40.300736', 'step': 1655, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.330983', 'step': 1655, 'epoch': 2} {'type': 'loss', 'content': 0.005810289643704891, 'timestamp': '2025-09-15 03:20:40.354617', 'step': 1656, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.391034', 'step': 1656, 'epoch': 2} {'type': 'loss', 'content': 0.024757912382483482, 'timestamp': '2025-09-15 03:20:40.393142', 'step': 1657, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:40.424332', 'step': 1657, 'epoch': 2} {'type': 'loss', 'content': 0.024632303044199944, 'timestamp': '2025-09-15 03:20:40.426236', 'step': 1658, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:40.456340', 'step': 1658, 'epoch': 2} {'type': 'loss', 'content': 0.008729967288672924, 'timestamp': '2025-09-15 03:20:40.458998', 'step': 1659, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.488944', 'step': 1659, 'epoch': 2} {'type': 'loss', 'content': 0.006651734001934528, 'timestamp': '2025-09-15 03:20:40.513017', 'step': 1660, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:40.543613', 'step': 1660, 'epoch': 2} {'type': 'loss', 'content': 0.02994653768837452, 'timestamp': '2025-09-15 03:20:40.545667', 'step': 1661, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:40.575843', 'step': 1661, 'epoch': 2} {'type': 'loss', 'content': 0.009992452338337898, 'timestamp': '2025-09-15 03:20:40.578324', 'step': 1662, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.608450', 'step': 1662, 'epoch': 2} {'type': 'loss', 'content': 0.007453125435858965, 'timestamp': '2025-09-15 03:20:40.610548', 'step': 1663, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:40.640807', 'step': 1663, 'epoch': 2} {'type': 'loss', 'content': 0.003623406635597348, 'timestamp': '2025-09-15 03:20:40.664292', 'step': 1664, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.693917', 'step': 1664, 'epoch': 2} {'type': 'loss', 'content': 0.013959839940071106, 'timestamp': '2025-09-15 03:20:40.695814', 'step': 1665, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.726574', 'step': 1665, 'epoch': 2} {'type': 'loss', 'content': 0.008011700585484505, 'timestamp': '2025-09-15 03:20:40.728630', 'step': 1666, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:40.758656', 'step': 1666, 'epoch': 2} {'type': 'loss', 'content': 0.01187701802700758, 'timestamp': '2025-09-15 03:20:40.760910', 'step': 1667, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:40.791848', 'step': 1667, 'epoch': 2} {'type': 'loss', 'content': 0.015608700923621655, 'timestamp': '2025-09-15 03:20:40.815628', 'step': 1668, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.846081', 'step': 1668, 'epoch': 2} {'type': 'loss', 'content': 0.04297996684908867, 'timestamp': '2025-09-15 03:20:40.848291', 'step': 1669, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:40.878298', 'step': 1669, 'epoch': 2} {'type': 'loss', 'content': 0.005474329926073551, 'timestamp': '2025-09-15 03:20:40.880300', 'step': 1670, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:40.910196', 'step': 1670, 'epoch': 2} {'type': 'loss', 'content': 0.00804048590362072, 'timestamp': '2025-09-15 03:20:40.912238', 'step': 1671, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:40.941921', 'step': 1671, 'epoch': 2} {'type': 'loss', 'content': 0.006009248550981283, 'timestamp': '2025-09-15 03:20:40.966692', 'step': 1672, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:40.997034', 'step': 1672, 'epoch': 2} {'type': 'loss', 'content': 0.011756404303014278, 'timestamp': '2025-09-15 03:20:41.001791', 'step': 1673, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.034752', 'step': 1673, 'epoch': 2} {'type': 'loss', 'content': 0.0031918776221573353, 'timestamp': '2025-09-15 03:20:41.037069', 'step': 1674, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:41.067133', 'step': 1674, 'epoch': 2} {'type': 'loss', 'content': 0.0008713051793165505, 'timestamp': '2025-09-15 03:20:41.069415', 'step': 1675, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:41.099794', 'step': 1675, 'epoch': 2} {'type': 'loss', 'content': 0.003177103353664279, 'timestamp': '2025-09-15 03:20:41.123206', 'step': 1676, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.152822', 'step': 1676, 'epoch': 2} {'type': 'loss', 'content': 0.0052427686750888824, 'timestamp': '2025-09-15 03:20:41.154997', 'step': 1677, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.184416', 'step': 1677, 'epoch': 2} {'type': 'loss', 'content': 0.002482310403138399, 'timestamp': '2025-09-15 03:20:41.186424', 'step': 1678, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:41.216301', 'step': 1678, 'epoch': 2} {'type': 'loss', 'content': 0.008572629652917385, 'timestamp': '2025-09-15 03:20:41.218792', 'step': 1679, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.249469', 'step': 1679, 'epoch': 2} {'type': 'loss', 'content': 0.015897506847977638, 'timestamp': '2025-09-15 03:20:41.273375', 'step': 1680, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.303511', 'step': 1680, 'epoch': 2} {'type': 'loss', 'content': 0.005093819461762905, 'timestamp': '2025-09-15 03:20:41.305513', 'step': 1681, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.336699', 'step': 1681, 'epoch': 2} {'type': 'loss', 'content': 0.003218486439436674, 'timestamp': '2025-09-15 03:20:41.338853', 'step': 1682, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.369209', 'step': 1682, 'epoch': 2} {'type': 'loss', 'content': 0.00489377835765481, 'timestamp': '2025-09-15 03:20:41.371412', 'step': 1683, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.401557', 'step': 1683, 'epoch': 2} {'type': 'loss', 'content': 0.0022957702167332172, 'timestamp': '2025-09-15 03:20:41.425070', 'step': 1684, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.455113', 'step': 1684, 'epoch': 2} {'type': 'loss', 'content': 0.01362029928714037, 'timestamp': '2025-09-15 03:20:41.457648', 'step': 1685, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.487373', 'step': 1685, 'epoch': 2} {'type': 'loss', 'content': 0.006150937173515558, 'timestamp': '2025-09-15 03:20:41.489644', 'step': 1686, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.519530', 'step': 1686, 'epoch': 2} {'type': 'loss', 'content': 0.03606478497385979, 'timestamp': '2025-09-15 03:20:41.521775', 'step': 1687, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:41.551690', 'step': 1687, 'epoch': 2} {'type': 'loss', 'content': 0.047492969781160355, 'timestamp': '2025-09-15 03:20:41.575323', 'step': 1688, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:41.605419', 'step': 1688, 'epoch': 2} {'type': 'loss', 'content': 0.0005590234650298953, 'timestamp': '2025-09-15 03:20:41.607620', 'step': 1689, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.638101', 'step': 1689, 'epoch': 2} {'type': 'loss', 'content': 0.04679226502776146, 'timestamp': '2025-09-15 03:20:41.640128', 'step': 1690, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.670723', 'step': 1690, 'epoch': 2} {'type': 'loss', 'content': 0.0004176944785285741, 'timestamp': '2025-09-15 03:20:41.672979', 'step': 1691, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:41.702924', 'step': 1691, 'epoch': 2} {'type': 'loss', 'content': 0.024234410375356674, 'timestamp': '2025-09-15 03:20:41.726381', 'step': 1692, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:41.756427', 'step': 1692, 'epoch': 2} {'type': 'loss', 'content': 0.012740524485707283, 'timestamp': '2025-09-15 03:20:41.758288', 'step': 1693, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:41.788508', 'step': 1693, 'epoch': 2} {'type': 'loss', 'content': 0.003929396625608206, 'timestamp': '2025-09-15 03:20:41.790747', 'step': 1694, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.820472', 'step': 1694, 'epoch': 2} {'type': 'loss', 'content': 0.0006833495572209358, 'timestamp': '2025-09-15 03:20:41.822335', 'step': 1695, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:41.851921', 'step': 1695, 'epoch': 2} {'type': 'loss', 'content': 0.04751553386449814, 'timestamp': '2025-09-15 03:20:41.875347', 'step': 1696, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.905416', 'step': 1696, 'epoch': 2} {'type': 'loss', 'content': 0.0025537125766277313, 'timestamp': '2025-09-15 03:20:41.907647', 'step': 1697, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:41.937629', 'step': 1697, 'epoch': 2} {'type': 'loss', 'content': 0.009955295361578465, 'timestamp': '2025-09-15 03:20:41.939876', 'step': 1698, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:41.970006', 'step': 1698, 'epoch': 2} {'type': 'loss', 'content': 0.0014152992516756058, 'timestamp': '2025-09-15 03:20:41.972056', 'step': 1699, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.002196', 'step': 1699, 'epoch': 2} {'type': 'loss', 'content': 0.01002996414899826, 'timestamp': '2025-09-15 03:20:42.025414', 'step': 1700, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:42.055554', 'step': 1700, 'epoch': 2} {'type': 'loss', 'content': 0.022007791325449944, 'timestamp': '2025-09-15 03:20:42.057517', 'step': 1701, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.087247', 'step': 1701, 'epoch': 2} {'type': 'loss', 'content': 0.00791712012141943, 'timestamp': '2025-09-15 03:20:42.090377', 'step': 1702, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:42.120682', 'step': 1702, 'epoch': 2} {'type': 'loss', 'content': 0.0015882424777373672, 'timestamp': '2025-09-15 03:20:42.122927', 'step': 1703, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.153266', 'step': 1703, 'epoch': 2} {'type': 'loss', 'content': 0.0013538190396502614, 'timestamp': '2025-09-15 03:20:42.176636', 'step': 1704, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.208301', 'step': 1704, 'epoch': 2} {'type': 'loss', 'content': 0.004893905948847532, 'timestamp': '2025-09-15 03:20:42.210359', 'step': 1705, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.241820', 'step': 1705, 'epoch': 2} {'type': 'loss', 'content': 0.022312074899673462, 'timestamp': '2025-09-15 03:20:42.244028', 'step': 1706, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:42.274923', 'step': 1706, 'epoch': 2} {'type': 'loss', 'content': 0.001479136641137302, 'timestamp': '2025-09-15 03:20:42.276865', 'step': 1707, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.307119', 'step': 1707, 'epoch': 2} {'type': 'loss', 'content': 0.004247893113642931, 'timestamp': '2025-09-15 03:20:42.330547', 'step': 1708, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.360325', 'step': 1708, 'epoch': 2} {'type': 'loss', 'content': 0.01009181048721075, 'timestamp': '2025-09-15 03:20:42.362291', 'step': 1709, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:42.391910', 'step': 1709, 'epoch': 2} {'type': 'loss', 'content': 0.001586645608767867, 'timestamp': '2025-09-15 03:20:42.395118', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:43.117474', 'step': 1710, 'epoch': 2} {'type': 'pplx', 'content': 83493489.47252946, 'timestamp': '2025-09-15 03:20:43.119598', 'step': 1710, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.148336', 'step': 1710, 'epoch': 2} {'type': 'loss', 'content': 0.015090351924300194, 'timestamp': '2025-09-15 03:20:43.150343', 'step': 1711, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.180832', 'step': 1711, 'epoch': 2} {'type': 'loss', 'content': 0.01114923320710659, 'timestamp': '2025-09-15 03:20:43.204356', 'step': 1712, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.234968', 'step': 1712, 'epoch': 2} {'type': 'loss', 'content': 0.006485131569206715, 'timestamp': '2025-09-15 03:20:43.237163', 'step': 1713, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.266997', 'step': 1713, 'epoch': 2} {'type': 'loss', 'content': 0.005379594396799803, 'timestamp': '2025-09-15 03:20:43.269041', 'step': 1714, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:43.299839', 'step': 1714, 'epoch': 2} {'type': 'loss', 'content': 0.002022078027948737, 'timestamp': '2025-09-15 03:20:43.301843', 'step': 1715, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.332526', 'step': 1715, 'epoch': 2} {'type': 'loss', 'content': 0.004368626978248358, 'timestamp': '2025-09-15 03:20:43.356377', 'step': 1716, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.387317', 'step': 1716, 'epoch': 2} {'type': 'loss', 'content': 0.03480208292603493, 'timestamp': '2025-09-15 03:20:43.389314', 'step': 1717, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.420851', 'step': 1717, 'epoch': 2} {'type': 'loss', 'content': 0.008094354532659054, 'timestamp': '2025-09-15 03:20:43.422898', 'step': 1718, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:43.453099', 'step': 1718, 'epoch': 2} {'type': 'loss', 'content': 0.014233270660042763, 'timestamp': '2025-09-15 03:20:43.455116', 'step': 1719, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.485088', 'step': 1719, 'epoch': 2} {'type': 'loss', 'content': 0.019168147817254066, 'timestamp': '2025-09-15 03:20:43.509096', 'step': 1720, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.538772', 'step': 1720, 'epoch': 2} {'type': 'loss', 'content': 0.0029123888816684484, 'timestamp': '2025-09-15 03:20:43.540775', 'step': 1721, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.570988', 'step': 1721, 'epoch': 2} {'type': 'loss', 'content': 0.010065059177577496, 'timestamp': '2025-09-15 03:20:43.573220', 'step': 1722, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.602709', 'step': 1722, 'epoch': 2} {'type': 'loss', 'content': 0.005632904823869467, 'timestamp': '2025-09-15 03:20:43.605318', 'step': 1723, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.635486', 'step': 1723, 'epoch': 2} {'type': 'loss', 'content': 0.0026095120701938868, 'timestamp': '2025-09-15 03:20:43.659098', 'step': 1724, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:43.689238', 'step': 1724, 'epoch': 2} {'type': 'loss', 'content': 0.0018347349250689149, 'timestamp': '2025-09-15 03:20:43.691287', 'step': 1725, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:43.721825', 'step': 1725, 'epoch': 2} {'type': 'loss', 'content': 0.0057741072960197926, 'timestamp': '2025-09-15 03:20:43.723926', 'step': 1726, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:43.753833', 'step': 1726, 'epoch': 2} {'type': 'loss', 'content': 0.02014450542628765, 'timestamp': '2025-09-15 03:20:43.756141', 'step': 1727, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.787069', 'step': 1727, 'epoch': 2} {'type': 'loss', 'content': 0.003958654589951038, 'timestamp': '2025-09-15 03:20:43.810292', 'step': 1728, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.841359', 'step': 1728, 'epoch': 2} {'type': 'loss', 'content': 0.003618575632572174, 'timestamp': '2025-09-15 03:20:43.843511', 'step': 1729, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.873413', 'step': 1729, 'epoch': 2} {'type': 'loss', 'content': 0.0016862701158970594, 'timestamp': '2025-09-15 03:20:43.875673', 'step': 1730, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:43.905561', 'step': 1730, 'epoch': 2} {'type': 'loss', 'content': 0.0172065868973732, 'timestamp': '2025-09-15 03:20:43.907718', 'step': 1731, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:43.938296', 'step': 1731, 'epoch': 2} {'type': 'loss', 'content': 0.0003309242019895464, 'timestamp': '2025-09-15 03:20:43.961829', 'step': 1732, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:43.991898', 'step': 1732, 'epoch': 2} {'type': 'loss', 'content': 0.0021879069972783327, 'timestamp': '2025-09-15 03:20:43.994071', 'step': 1733, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.024146', 'step': 1733, 'epoch': 2} {'type': 'loss', 'content': 0.009583430364727974, 'timestamp': '2025-09-15 03:20:44.026144', 'step': 1734, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.056272', 'step': 1734, 'epoch': 2} {'type': 'loss', 'content': 0.006018994841724634, 'timestamp': '2025-09-15 03:20:44.058080', 'step': 1735, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.088023', 'step': 1735, 'epoch': 2} {'type': 'loss', 'content': 0.0028155508916825056, 'timestamp': '2025-09-15 03:20:44.111442', 'step': 1736, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.141841', 'step': 1736, 'epoch': 2} {'type': 'loss', 'content': 0.009163476526737213, 'timestamp': '2025-09-15 03:20:44.143912', 'step': 1737, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:44.175174', 'step': 1737, 'epoch': 2} {'type': 'loss', 'content': 0.009106485173106194, 'timestamp': '2025-09-15 03:20:44.188879', 'step': 1738, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.228187', 'step': 1738, 'epoch': 2} {'type': 'loss', 'content': 0.0013123898534104228, 'timestamp': '2025-09-15 03:20:44.233528', 'step': 1739, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.264347', 'step': 1739, 'epoch': 2} {'type': 'loss', 'content': 0.0030263494700193405, 'timestamp': '2025-09-15 03:20:44.287704', 'step': 1740, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:44.318072', 'step': 1740, 'epoch': 2} {'type': 'loss', 'content': 0.02149033546447754, 'timestamp': '2025-09-15 03:20:44.320075', 'step': 1741, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:44.353689', 'step': 1741, 'epoch': 2} {'type': 'loss', 'content': 0.0038601472042500973, 'timestamp': '2025-09-15 03:20:44.356010', 'step': 1742, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.386100', 'step': 1742, 'epoch': 2} {'type': 'loss', 'content': 0.0044944509863853455, 'timestamp': '2025-09-15 03:20:44.388029', 'step': 1743, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.417741', 'step': 1743, 'epoch': 2} {'type': 'loss', 'content': 0.011085770092904568, 'timestamp': '2025-09-15 03:20:44.442293', 'step': 1744, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.473336', 'step': 1744, 'epoch': 2} {'type': 'loss', 'content': 0.0032526575960218906, 'timestamp': '2025-09-15 03:20:44.475200', 'step': 1745, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.505177', 'step': 1745, 'epoch': 2} {'type': 'loss', 'content': 0.002188687212765217, 'timestamp': '2025-09-15 03:20:44.509598', 'step': 1746, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.539571', 'step': 1746, 'epoch': 2} {'type': 'loss', 'content': 0.0019085209351032972, 'timestamp': '2025-09-15 03:20:44.541532', 'step': 1747, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.571388', 'step': 1747, 'epoch': 2} {'type': 'loss', 'content': 0.0015842962311580777, 'timestamp': '2025-09-15 03:20:44.594903', 'step': 1748, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.625855', 'step': 1748, 'epoch': 2} {'type': 'loss', 'content': 0.0016496534226462245, 'timestamp': '2025-09-15 03:20:44.631515', 'step': 1749, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.662592', 'step': 1749, 'epoch': 2} {'type': 'loss', 'content': 0.0018758989172056317, 'timestamp': '2025-09-15 03:20:44.664945', 'step': 1750, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:44.695562', 'step': 1750, 'epoch': 2} {'type': 'loss', 'content': 0.019384147599339485, 'timestamp': '2025-09-15 03:20:44.697856', 'step': 1751, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.729048', 'step': 1751, 'epoch': 2} {'type': 'loss', 'content': 0.0029005541000515223, 'timestamp': '2025-09-15 03:20:44.752789', 'step': 1752, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.782658', 'step': 1752, 'epoch': 2} {'type': 'loss', 'content': 0.016111260280013084, 'timestamp': '2025-09-15 03:20:44.788194', 'step': 1753, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:44.823184', 'step': 1753, 'epoch': 2} {'type': 'loss', 'content': 0.0369202196598053, 'timestamp': '2025-09-15 03:20:44.826050', 'step': 1754, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.856300', 'step': 1754, 'epoch': 2} {'type': 'loss', 'content': 0.002441554795950651, 'timestamp': '2025-09-15 03:20:44.858273', 'step': 1755, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.888430', 'step': 1755, 'epoch': 2} {'type': 'loss', 'content': 0.005904212594032288, 'timestamp': '2025-09-15 03:20:44.916851', 'step': 1756, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:44.952393', 'step': 1756, 'epoch': 2} {'type': 'loss', 'content': 0.0009123924537561834, 'timestamp': '2025-09-15 03:20:44.954588', 'step': 1757, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:44.984359', 'step': 1757, 'epoch': 2} {'type': 'loss', 'content': 0.0018833059584721923, 'timestamp': '2025-09-15 03:20:44.986753', 'step': 1758, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:45.016376', 'step': 1758, 'epoch': 2} {'type': 'loss', 'content': 0.001379848108626902, 'timestamp': '2025-09-15 03:20:45.018616', 'step': 1759, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:45.048322', 'step': 1759, 'epoch': 2} {'type': 'loss', 'content': 0.025038572028279305, 'timestamp': '2025-09-15 03:20:45.071809', 'step': 1760, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:45.102091', 'step': 1760, 'epoch': 2} {'type': 'loss', 'content': 0.0022791826631873846, 'timestamp': '2025-09-15 03:20:45.105200', 'step': 1761, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:45.135687', 'step': 1761, 'epoch': 2} {'type': 'loss', 'content': 0.00048445953871123493, 'timestamp': '2025-09-15 03:20:45.138112', 'step': 1762, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:45.170663', 'step': 1762, 'epoch': 2} {'type': 'loss', 'content': 0.0004319077415857464, 'timestamp': '2025-09-15 03:20:45.172913', 'step': 1763, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:45.202911', 'step': 1763, 'epoch': 2} {'type': 'loss', 'content': 0.007473528850823641, 'timestamp': '2025-09-15 03:20:45.226569', 'step': 1764, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:45.257991', 'step': 1764, 'epoch': 2} {'type': 'loss', 'content': 0.005291355308145285, 'timestamp': '2025-09-15 03:20:45.260130', 'step': 1765, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:45.289901', 'step': 1765, 'epoch': 2} {'type': 'loss', 'content': 0.0004802733601536602, 'timestamp': '2025-09-15 03:20:45.292068', 'step': 1766, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:45.322206', 'step': 1766, 'epoch': 2} {'type': 'loss', 'content': 0.00027232806314714253, 'timestamp': '2025-09-15 03:20:45.324542', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:46.042368', 'step': 1767, 'epoch': 2} {'type': 'pplx', 'content': 84625213.86497217, 'timestamp': '2025-09-15 03:20:46.044474', 'step': 1767, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.072755', 'step': 1767, 'epoch': 2} {'type': 'loss', 'content': 0.00040455401176586747, 'timestamp': '2025-09-15 03:20:46.096280', 'step': 1768, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:46.126083', 'step': 1768, 'epoch': 2} {'type': 'loss', 'content': 0.013869697228074074, 'timestamp': '2025-09-15 03:20:46.127945', 'step': 1769, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.158184', 'step': 1769, 'epoch': 2} {'type': 'loss', 'content': 0.0016579412622377276, 'timestamp': '2025-09-15 03:20:46.160233', 'step': 1770, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:46.191522', 'step': 1770, 'epoch': 2} {'type': 'loss', 'content': 0.0013221147237345576, 'timestamp': '2025-09-15 03:20:46.193724', 'step': 1771, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.223768', 'step': 1771, 'epoch': 2} {'type': 'loss', 'content': 0.00014326020027510822, 'timestamp': '2025-09-15 03:20:46.247221', 'step': 1772, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.277266', 'step': 1772, 'epoch': 2} {'type': 'loss', 'content': 0.0008943701977841556, 'timestamp': '2025-09-15 03:20:46.279598', 'step': 1773, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.310031', 'step': 1773, 'epoch': 2} {'type': 'loss', 'content': 0.0063716270960867405, 'timestamp': '2025-09-15 03:20:46.312096', 'step': 1774, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:46.341770', 'step': 1774, 'epoch': 2} {'type': 'loss', 'content': 0.0200633704662323, 'timestamp': '2025-09-15 03:20:46.344779', 'step': 1775, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.374980', 'step': 1775, 'epoch': 2} {'type': 'loss', 'content': 0.0015325994463637471, 'timestamp': '2025-09-15 03:20:46.398474', 'step': 1776, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.430797', 'step': 1776, 'epoch': 2} {'type': 'loss', 'content': 0.0016208905726671219, 'timestamp': '2025-09-15 03:20:46.432859', 'step': 1777, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.462788', 'step': 1777, 'epoch': 2} {'type': 'loss', 'content': 0.0006189151317812502, 'timestamp': '2025-09-15 03:20:46.465097', 'step': 1778, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:46.495495', 'step': 1778, 'epoch': 2} {'type': 'loss', 'content': 0.0018870577914640307, 'timestamp': '2025-09-15 03:20:46.497784', 'step': 1779, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.527883', 'step': 1779, 'epoch': 2} {'type': 'loss', 'content': 0.0016338779823854566, 'timestamp': '2025-09-15 03:20:46.551495', 'step': 1780, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.581357', 'step': 1780, 'epoch': 2} {'type': 'loss', 'content': 0.0007233429932966828, 'timestamp': '2025-09-15 03:20:46.583323', 'step': 1781, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.613213', 'step': 1781, 'epoch': 2} {'type': 'loss', 'content': 0.0027278957422822714, 'timestamp': '2025-09-15 03:20:46.615137', 'step': 1782, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:46.645534', 'step': 1782, 'epoch': 2} {'type': 'loss', 'content': 0.0006363813881762326, 'timestamp': '2025-09-15 03:20:46.649028', 'step': 1783, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.679027', 'step': 1783, 'epoch': 2} {'type': 'loss', 'content': 0.0005047211307100952, 'timestamp': '2025-09-15 03:20:46.702401', 'step': 1784, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.732556', 'step': 1784, 'epoch': 2} {'type': 'loss', 'content': 0.0002988137421198189, 'timestamp': '2025-09-15 03:20:46.734597', 'step': 1785, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:46.764920', 'step': 1785, 'epoch': 2} {'type': 'loss', 'content': 0.0006779587711207569, 'timestamp': '2025-09-15 03:20:46.767073', 'step': 1786, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.796271', 'step': 1786, 'epoch': 2} {'type': 'loss', 'content': 0.016672402620315552, 'timestamp': '2025-09-15 03:20:46.798085', 'step': 1787, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.827670', 'step': 1787, 'epoch': 2} {'type': 'loss', 'content': 0.0010021728230640292, 'timestamp': '2025-09-15 03:20:46.851165', 'step': 1788, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.885174', 'step': 1788, 'epoch': 2} {'type': 'loss', 'content': 0.0002892357297241688, 'timestamp': '2025-09-15 03:20:46.887130', 'step': 1789, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.918801', 'step': 1789, 'epoch': 2} {'type': 'loss', 'content': 0.0004077693447470665, 'timestamp': '2025-09-15 03:20:46.921088', 'step': 1790, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.951079', 'step': 1790, 'epoch': 2} {'type': 'loss', 'content': 0.002524502808228135, 'timestamp': '2025-09-15 03:20:46.957439', 'step': 1791, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:46.987001', 'step': 1791, 'epoch': 2} {'type': 'loss', 'content': 0.003516556229442358, 'timestamp': '2025-09-15 03:20:47.010613', 'step': 1792, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.042252', 'step': 1792, 'epoch': 2} {'type': 'loss', 'content': 0.003348279744386673, 'timestamp': '2025-09-15 03:20:47.052410', 'step': 1793, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.085305', 'step': 1793, 'epoch': 2} {'type': 'loss', 'content': 0.0006031015072949231, 'timestamp': '2025-09-15 03:20:47.091458', 'step': 1794, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.123623', 'step': 1794, 'epoch': 2} {'type': 'loss', 'content': 0.000745028315577656, 'timestamp': '2025-09-15 03:20:47.125805', 'step': 1795, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.155477', 'step': 1795, 'epoch': 2} {'type': 'loss', 'content': 0.001737898332066834, 'timestamp': '2025-09-15 03:20:47.180130', 'step': 1796, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:47.211496', 'step': 1796, 'epoch': 2} {'type': 'loss', 'content': 0.023937616497278214, 'timestamp': '2025-09-15 03:20:47.213548', 'step': 1797, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.250347', 'step': 1797, 'epoch': 2} {'type': 'loss', 'content': 0.043688591569662094, 'timestamp': '2025-09-15 03:20:47.252397', 'step': 1798, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.283446', 'step': 1798, 'epoch': 2} {'type': 'loss', 'content': 0.02030237577855587, 'timestamp': '2025-09-15 03:20:47.285632', 'step': 1799, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.315678', 'step': 1799, 'epoch': 2} {'type': 'loss', 'content': 0.0310269333422184, 'timestamp': '2025-09-15 03:20:47.339295', 'step': 1800, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.369023', 'step': 1800, 'epoch': 2} {'type': 'loss', 'content': 0.00030304151005111635, 'timestamp': '2025-09-15 03:20:47.372188', 'step': 1801, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.402167', 'step': 1801, 'epoch': 2} {'type': 'loss', 'content': 0.009272119961678982, 'timestamp': '2025-09-15 03:20:47.404817', 'step': 1802, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.435404', 'step': 1802, 'epoch': 2} {'type': 'loss', 'content': 0.0034189175348728895, 'timestamp': '2025-09-15 03:20:47.437910', 'step': 1803, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.468080', 'step': 1803, 'epoch': 2} {'type': 'loss', 'content': 0.0010578955989331007, 'timestamp': '2025-09-15 03:20:47.491793', 'step': 1804, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.523110', 'step': 1804, 'epoch': 2} {'type': 'loss', 'content': 0.012314596213400364, 'timestamp': '2025-09-15 03:20:47.525210', 'step': 1805, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.554945', 'step': 1805, 'epoch': 2} {'type': 'loss', 'content': 0.002753838896751404, 'timestamp': '2025-09-15 03:20:47.556971', 'step': 1806, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.587803', 'step': 1806, 'epoch': 2} {'type': 'loss', 'content': 0.013189495541155338, 'timestamp': '2025-09-15 03:20:47.589892', 'step': 1807, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.620166', 'step': 1807, 'epoch': 2} {'type': 'loss', 'content': 0.00679773697629571, 'timestamp': '2025-09-15 03:20:47.643709', 'step': 1808, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.673709', 'step': 1808, 'epoch': 2} {'type': 'loss', 'content': 0.0012394858058542013, 'timestamp': '2025-09-15 03:20:47.676300', 'step': 1809, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.706354', 'step': 1809, 'epoch': 2} {'type': 'loss', 'content': 0.008619984611868858, 'timestamp': '2025-09-15 03:20:47.708756', 'step': 1810, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.738318', 'step': 1810, 'epoch': 2} {'type': 'loss', 'content': 0.004352693445980549, 'timestamp': '2025-09-15 03:20:47.740323', 'step': 1811, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.770745', 'step': 1811, 'epoch': 2} {'type': 'loss', 'content': 0.0032818778418004513, 'timestamp': '2025-09-15 03:20:47.794075', 'step': 1812, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.824938', 'step': 1812, 'epoch': 2} {'type': 'loss', 'content': 0.004312444012612104, 'timestamp': '2025-09-15 03:20:47.827045', 'step': 1813, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:47.856827', 'step': 1813, 'epoch': 2} {'type': 'loss', 'content': 0.003813502611592412, 'timestamp': '2025-09-15 03:20:47.858927', 'step': 1814, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.889682', 'step': 1814, 'epoch': 2} {'type': 'loss', 'content': 0.009257403202354908, 'timestamp': '2025-09-15 03:20:47.892233', 'step': 1815, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.923309', 'step': 1815, 'epoch': 2} {'type': 'loss', 'content': 0.0016402419423684478, 'timestamp': '2025-09-15 03:20:47.946759', 'step': 1816, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:47.976831', 'step': 1816, 'epoch': 2} {'type': 'loss', 'content': 0.0017574775265529752, 'timestamp': '2025-09-15 03:20:47.978838', 'step': 1817, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:48.008313', 'step': 1817, 'epoch': 2} {'type': 'loss', 'content': 0.05542900040745735, 'timestamp': '2025-09-15 03:20:48.010251', 'step': 1818, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:48.039988', 'step': 1818, 'epoch': 2} {'type': 'loss', 'content': 0.04872005432844162, 'timestamp': '2025-09-15 03:20:48.042069', 'step': 1819, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:48.071859', 'step': 1819, 'epoch': 2} {'type': 'loss', 'content': 0.0038178430404514074, 'timestamp': '2025-09-15 03:20:48.095431', 'step': 1820, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:48.126232', 'step': 1820, 'epoch': 2} {'type': 'loss', 'content': 0.00633536372333765, 'timestamp': '2025-09-15 03:20:48.128206', 'step': 1821, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:48.157544', 'step': 1821, 'epoch': 2} {'type': 'loss', 'content': 0.003921452444046736, 'timestamp': '2025-09-15 03:20:48.159710', 'step': 1822, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:48.189801', 'step': 1822, 'epoch': 2} {'type': 'loss', 'content': 0.005446175578981638, 'timestamp': '2025-09-15 03:20:48.191914', 'step': 1823, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:48.221993', 'step': 1823, 'epoch': 2} {'type': 'loss', 'content': 0.027580486610531807, 'timestamp': '2025-09-15 03:20:48.245276', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:48.980456', 'step': 1824, 'epoch': 2} {'type': 'pplx', 'content': 84274127.79866508, 'timestamp': '2025-09-15 03:20:48.982568', 'step': 1824, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.010779', 'step': 1824, 'epoch': 2} {'type': 'loss', 'content': 0.006789560429751873, 'timestamp': '2025-09-15 03:20:49.012856', 'step': 1825, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.043087', 'step': 1825, 'epoch': 2} {'type': 'loss', 'content': 0.00810826662927866, 'timestamp': '2025-09-15 03:20:49.045200', 'step': 1826, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:49.076599', 'step': 1826, 'epoch': 2} {'type': 'loss', 'content': 0.003023170167580247, 'timestamp': '2025-09-15 03:20:49.079771', 'step': 1827, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:49.110409', 'step': 1827, 'epoch': 2} {'type': 'loss', 'content': 0.016775641590356827, 'timestamp': '2025-09-15 03:20:49.134122', 'step': 1828, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:49.163833', 'step': 1828, 'epoch': 2} {'type': 'loss', 'content': 0.0007950032013468444, 'timestamp': '2025-09-15 03:20:49.165954', 'step': 1829, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.196133', 'step': 1829, 'epoch': 2} {'type': 'loss', 'content': 0.04435531049966812, 'timestamp': '2025-09-15 03:20:49.198152', 'step': 1830, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:49.228427', 'step': 1830, 'epoch': 2} {'type': 'loss', 'content': 0.0005618184222839773, 'timestamp': '2025-09-15 03:20:49.230466', 'step': 1831, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.260564', 'step': 1831, 'epoch': 2} {'type': 'loss', 'content': 0.002401631325483322, 'timestamp': '2025-09-15 03:20:49.284106', 'step': 1832, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.314240', 'step': 1832, 'epoch': 2} {'type': 'loss', 'content': 0.0023109540343284607, 'timestamp': '2025-09-15 03:20:49.316186', 'step': 1833, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:49.346486', 'step': 1833, 'epoch': 2} {'type': 'loss', 'content': 0.007046803366392851, 'timestamp': '2025-09-15 03:20:49.348641', 'step': 1834, 'epoch': 2} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:49.397598', 'step': 1834, 'epoch': 3} {'type': 'loss', 'content': 0.040310557931661606, 'timestamp': '2025-09-15 03:20:49.399958', 'step': 1835, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.430018', 'step': 1835, 'epoch': 3} {'type': 'loss', 'content': 0.013268978334963322, 'timestamp': '2025-09-15 03:20:49.453512', 'step': 1836, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.483351', 'step': 1836, 'epoch': 3} {'type': 'loss', 'content': 0.011992714367806911, 'timestamp': '2025-09-15 03:20:49.485576', 'step': 1837, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.515324', 'step': 1837, 'epoch': 3} {'type': 'loss', 'content': 0.004671222064644098, 'timestamp': '2025-09-15 03:20:49.517338', 'step': 1838, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.546913', 'step': 1838, 'epoch': 3} {'type': 'loss', 'content': 0.06716583669185638, 'timestamp': '2025-09-15 03:20:49.549067', 'step': 1839, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.578882', 'step': 1839, 'epoch': 3} {'type': 'loss', 'content': 0.020287616178393364, 'timestamp': '2025-09-15 03:20:49.602300', 'step': 1840, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:49.631774', 'step': 1840, 'epoch': 3} {'type': 'loss', 'content': 0.007350075524300337, 'timestamp': '2025-09-15 03:20:49.633800', 'step': 1841, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.663850', 'step': 1841, 'epoch': 3} {'type': 'loss', 'content': 0.001616068184375763, 'timestamp': '2025-09-15 03:20:49.665806', 'step': 1842, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.696464', 'step': 1842, 'epoch': 3} {'type': 'loss', 'content': 0.00043574132723733783, 'timestamp': '2025-09-15 03:20:49.698764', 'step': 1843, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.728942', 'step': 1843, 'epoch': 3} {'type': 'loss', 'content': 0.04232177138328552, 'timestamp': '2025-09-15 03:20:49.752383', 'step': 1844, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.782171', 'step': 1844, 'epoch': 3} {'type': 'loss', 'content': 0.01541460957378149, 'timestamp': '2025-09-15 03:20:49.784140', 'step': 1845, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.815011', 'step': 1845, 'epoch': 3} {'type': 'loss', 'content': 0.0018699252977967262, 'timestamp': '2025-09-15 03:20:49.817076', 'step': 1846, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.847003', 'step': 1846, 'epoch': 3} {'type': 'loss', 'content': 0.015663478523492813, 'timestamp': '2025-09-15 03:20:49.849051', 'step': 1847, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.879032', 'step': 1847, 'epoch': 3} {'type': 'loss', 'content': 0.029345327988266945, 'timestamp': '2025-09-15 03:20:49.902469', 'step': 1848, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.933206', 'step': 1848, 'epoch': 3} {'type': 'loss', 'content': 0.016897190362215042, 'timestamp': '2025-09-15 03:20:49.935474', 'step': 1849, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.965530', 'step': 1849, 'epoch': 3} {'type': 'loss', 'content': 0.015935799106955528, 'timestamp': '2025-09-15 03:20:49.967846', 'step': 1850, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:49.997627', 'step': 1850, 'epoch': 3} {'type': 'loss', 'content': 0.010694613680243492, 'timestamp': '2025-09-15 03:20:50.000188', 'step': 1851, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.029852', 'step': 1851, 'epoch': 3} {'type': 'loss', 'content': 0.0030253021977841854, 'timestamp': '2025-09-15 03:20:50.053739', 'step': 1852, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:50.084010', 'step': 1852, 'epoch': 3} {'type': 'loss', 'content': 0.022218191996216774, 'timestamp': '2025-09-15 03:20:50.086116', 'step': 1853, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.116063', 'step': 1853, 'epoch': 3} {'type': 'loss', 'content': 0.007684533949941397, 'timestamp': '2025-09-15 03:20:50.118237', 'step': 1854, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:50.149110', 'step': 1854, 'epoch': 3} {'type': 'loss', 'content': 0.034298017621040344, 'timestamp': '2025-09-15 03:20:50.151343', 'step': 1855, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.181171', 'step': 1855, 'epoch': 3} {'type': 'loss', 'content': 0.04699214920401573, 'timestamp': '2025-09-15 03:20:50.204608', 'step': 1856, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.234970', 'step': 1856, 'epoch': 3} {'type': 'loss', 'content': 0.008235123939812183, 'timestamp': '2025-09-15 03:20:50.237249', 'step': 1857, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.267000', 'step': 1857, 'epoch': 3} {'type': 'loss', 'content': 0.012289443984627724, 'timestamp': '2025-09-15 03:20:50.269110', 'step': 1858, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.298189', 'step': 1858, 'epoch': 3} {'type': 'loss', 'content': 0.01189574133604765, 'timestamp': '2025-09-15 03:20:50.300413', 'step': 1859, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:50.330532', 'step': 1859, 'epoch': 3} {'type': 'loss', 'content': 0.008656212128698826, 'timestamp': '2025-09-15 03:20:50.353987', 'step': 1860, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:50.384209', 'step': 1860, 'epoch': 3} {'type': 'loss', 'content': 0.00650829216465354, 'timestamp': '2025-09-15 03:20:50.386467', 'step': 1861, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.416154', 'step': 1861, 'epoch': 3} {'type': 'loss', 'content': 0.022483665496110916, 'timestamp': '2025-09-15 03:20:50.418519', 'step': 1862, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.449524', 'step': 1862, 'epoch': 3} {'type': 'loss', 'content': 0.005312159191817045, 'timestamp': '2025-09-15 03:20:50.451704', 'step': 1863, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.482022', 'step': 1863, 'epoch': 3} {'type': 'loss', 'content': 0.015590175986289978, 'timestamp': '2025-09-15 03:20:50.505977', 'step': 1864, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.536742', 'step': 1864, 'epoch': 3} {'type': 'loss', 'content': 0.015565576031804085, 'timestamp': '2025-09-15 03:20:50.538965', 'step': 1865, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.569106', 'step': 1865, 'epoch': 3} {'type': 'loss', 'content': 0.01493970025330782, 'timestamp': '2025-09-15 03:20:50.571767', 'step': 1866, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.602732', 'step': 1866, 'epoch': 3} {'type': 'loss', 'content': 0.006512957159429789, 'timestamp': '2025-09-15 03:20:50.605205', 'step': 1867, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.634820', 'step': 1867, 'epoch': 3} {'type': 'loss', 'content': 0.017866162583231926, 'timestamp': '2025-09-15 03:20:50.658425', 'step': 1868, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.688404', 'step': 1868, 'epoch': 3} {'type': 'loss', 'content': 0.00460231676697731, 'timestamp': '2025-09-15 03:20:50.690727', 'step': 1869, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.721143', 'step': 1869, 'epoch': 3} {'type': 'loss', 'content': 0.020427634939551353, 'timestamp': '2025-09-15 03:20:50.723349', 'step': 1870, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.754016', 'step': 1870, 'epoch': 3} {'type': 'loss', 'content': 0.01348375715315342, 'timestamp': '2025-09-15 03:20:50.756298', 'step': 1871, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.785901', 'step': 1871, 'epoch': 3} {'type': 'loss', 'content': 0.006125123240053654, 'timestamp': '2025-09-15 03:20:50.809991', 'step': 1872, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:50.840723', 'step': 1872, 'epoch': 3} {'type': 'loss', 'content': 0.010490350425243378, 'timestamp': '2025-09-15 03:20:50.843088', 'step': 1873, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.872991', 'step': 1873, 'epoch': 3} {'type': 'loss', 'content': 0.017887888476252556, 'timestamp': '2025-09-15 03:20:50.875133', 'step': 1874, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.904712', 'step': 1874, 'epoch': 3} {'type': 'loss', 'content': 0.00924895703792572, 'timestamp': '2025-09-15 03:20:50.907104', 'step': 1875, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:50.937259', 'step': 1875, 'epoch': 3} {'type': 'loss', 'content': 0.005214744247496128, 'timestamp': '2025-09-15 03:20:50.961010', 'step': 1876, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:50.991140', 'step': 1876, 'epoch': 3} {'type': 'loss', 'content': 0.009649758227169514, 'timestamp': '2025-09-15 03:20:50.993549', 'step': 1877, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:51.023860', 'step': 1877, 'epoch': 3} {'type': 'loss', 'content': 0.0009704649564810097, 'timestamp': '2025-09-15 03:20:51.026196', 'step': 1878, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:51.057399', 'step': 1878, 'epoch': 3} {'type': 'loss', 'content': 0.004649874288588762, 'timestamp': '2025-09-15 03:20:51.059650', 'step': 1879, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:51.090033', 'step': 1879, 'epoch': 3} {'type': 'loss', 'content': 0.0027771240565925837, 'timestamp': '2025-09-15 03:20:51.113694', 'step': 1880, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:51.144361', 'step': 1880, 'epoch': 3} {'type': 'loss', 'content': 0.007548298221081495, 'timestamp': '2025-09-15 03:20:51.146866', 'step': 1881, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:51.852012', 'step': 1881, 'epoch': 3} {'type': 'pplx', 'content': 79061968.0886773, 'timestamp': '2025-09-15 03:20:51.854071', 'step': 1881, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:51.882175', 'step': 1881, 'epoch': 3} {'type': 'loss', 'content': 0.0009704644908197224, 'timestamp': '2025-09-15 03:20:51.884140', 'step': 1882, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:51.913876', 'step': 1882, 'epoch': 3} {'type': 'loss', 'content': 0.001010966137982905, 'timestamp': '2025-09-15 03:20:51.915862', 'step': 1883, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:51.946098', 'step': 1883, 'epoch': 3} {'type': 'loss', 'content': 0.011982734315097332, 'timestamp': '2025-09-15 03:20:51.969611', 'step': 1884, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.000563', 'step': 1884, 'epoch': 3} {'type': 'loss', 'content': 0.0003622773219831288, 'timestamp': '2025-09-15 03:20:52.002484', 'step': 1885, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.031979', 'step': 1885, 'epoch': 3} {'type': 'loss', 'content': 0.0402422733604908, 'timestamp': '2025-09-15 03:20:52.034011', 'step': 1886, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.063552', 'step': 1886, 'epoch': 3} {'type': 'loss', 'content': 0.015572545118629932, 'timestamp': '2025-09-15 03:20:52.065598', 'step': 1887, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:52.095203', 'step': 1887, 'epoch': 3} {'type': 'loss', 'content': 0.02413473092019558, 'timestamp': '2025-09-15 03:20:52.118638', 'step': 1888, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.148362', 'step': 1888, 'epoch': 3} {'type': 'loss', 'content': 0.010828151367604733, 'timestamp': '2025-09-15 03:20:52.150580', 'step': 1889, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.181421', 'step': 1889, 'epoch': 3} {'type': 'loss', 'content': 0.010638976469635963, 'timestamp': '2025-09-15 03:20:52.183846', 'step': 1890, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.214797', 'step': 1890, 'epoch': 3} {'type': 'loss', 'content': 0.006136444862931967, 'timestamp': '2025-09-15 03:20:52.218356', 'step': 1891, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.248239', 'step': 1891, 'epoch': 3} {'type': 'loss', 'content': 0.005477941129356623, 'timestamp': '2025-09-15 03:20:52.271651', 'step': 1892, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.301664', 'step': 1892, 'epoch': 3} {'type': 'loss', 'content': 0.009350026026368141, 'timestamp': '2025-09-15 03:20:52.303565', 'step': 1893, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.333318', 'step': 1893, 'epoch': 3} {'type': 'loss', 'content': 0.03536457568407059, 'timestamp': '2025-09-15 03:20:52.335236', 'step': 1894, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.365100', 'step': 1894, 'epoch': 3} {'type': 'loss', 'content': 0.0048360563814640045, 'timestamp': '2025-09-15 03:20:52.367179', 'step': 1895, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.397509', 'step': 1895, 'epoch': 3} {'type': 'loss', 'content': 0.011292965151369572, 'timestamp': '2025-09-15 03:20:52.421025', 'step': 1896, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.453347', 'step': 1896, 'epoch': 3} {'type': 'loss', 'content': 0.0005236997385509312, 'timestamp': '2025-09-15 03:20:52.455319', 'step': 1897, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.485482', 'step': 1897, 'epoch': 3} {'type': 'loss', 'content': 0.001544496393762529, 'timestamp': '2025-09-15 03:20:52.487561', 'step': 1898, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.517393', 'step': 1898, 'epoch': 3} {'type': 'loss', 'content': 0.0006119143799878657, 'timestamp': '2025-09-15 03:20:52.519281', 'step': 1899, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.551270', 'step': 1899, 'epoch': 3} {'type': 'loss', 'content': 0.04506358131766319, 'timestamp': '2025-09-15 03:20:52.574764', 'step': 1900, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.604741', 'step': 1900, 'epoch': 3} {'type': 'loss', 'content': 0.0015903854509815574, 'timestamp': '2025-09-15 03:20:52.606747', 'step': 1901, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.636736', 'step': 1901, 'epoch': 3} {'type': 'loss', 'content': 0.0056643313728272915, 'timestamp': '2025-09-15 03:20:52.638620', 'step': 1902, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.668361', 'step': 1902, 'epoch': 3} {'type': 'loss', 'content': 0.012327468022704124, 'timestamp': '2025-09-15 03:20:52.670561', 'step': 1903, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.700271', 'step': 1903, 'epoch': 3} {'type': 'loss', 'content': 0.03373425453901291, 'timestamp': '2025-09-15 03:20:52.723818', 'step': 1904, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.754999', 'step': 1904, 'epoch': 3} {'type': 'loss', 'content': 0.0006273844628594816, 'timestamp': '2025-09-15 03:20:52.757167', 'step': 1905, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.786773', 'step': 1905, 'epoch': 3} {'type': 'loss', 'content': 0.03912340849637985, 'timestamp': '2025-09-15 03:20:52.788766', 'step': 1906, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:52.819254', 'step': 1906, 'epoch': 3} {'type': 'loss', 'content': 0.0008735408773645759, 'timestamp': '2025-09-15 03:20:52.821325', 'step': 1907, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.851495', 'step': 1907, 'epoch': 3} {'type': 'loss', 'content': 0.03705182299017906, 'timestamp': '2025-09-15 03:20:52.874957', 'step': 1908, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.904665', 'step': 1908, 'epoch': 3} {'type': 'loss', 'content': 0.011124390177428722, 'timestamp': '2025-09-15 03:20:52.906745', 'step': 1909, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:52.936488', 'step': 1909, 'epoch': 3} {'type': 'loss', 'content': 0.014945329166948795, 'timestamp': '2025-09-15 03:20:52.938666', 'step': 1910, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:52.969056', 'step': 1910, 'epoch': 3} {'type': 'loss', 'content': 0.013663768768310547, 'timestamp': '2025-09-15 03:20:52.971198', 'step': 1911, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.001105', 'step': 1911, 'epoch': 3} {'type': 'loss', 'content': 0.007298811338841915, 'timestamp': '2025-09-15 03:20:53.024486', 'step': 1912, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.056321', 'step': 1912, 'epoch': 3} {'type': 'loss', 'content': 0.00904295314103365, 'timestamp': '2025-09-15 03:20:53.058356', 'step': 1913, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:53.088290', 'step': 1913, 'epoch': 3} {'type': 'loss', 'content': 0.008594379760324955, 'timestamp': '2025-09-15 03:20:53.090518', 'step': 1914, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.120560', 'step': 1914, 'epoch': 3} {'type': 'loss', 'content': 0.020318692550063133, 'timestamp': '2025-09-15 03:20:53.122652', 'step': 1915, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.152315', 'step': 1915, 'epoch': 3} {'type': 'loss', 'content': 0.006535095628350973, 'timestamp': '2025-09-15 03:20:53.175587', 'step': 1916, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.205024', 'step': 1916, 'epoch': 3} {'type': 'loss', 'content': 0.003774958895519376, 'timestamp': '2025-09-15 03:20:53.207151', 'step': 1917, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.236931', 'step': 1917, 'epoch': 3} {'type': 'loss', 'content': 0.019767671823501587, 'timestamp': '2025-09-15 03:20:53.239046', 'step': 1918, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.269131', 'step': 1918, 'epoch': 3} {'type': 'loss', 'content': 0.0036115716211497784, 'timestamp': '2025-09-15 03:20:53.271448', 'step': 1919, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.301092', 'step': 1919, 'epoch': 3} {'type': 'loss', 'content': 0.02624139003455639, 'timestamp': '2025-09-15 03:20:53.324534', 'step': 1920, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.354373', 'step': 1920, 'epoch': 3} {'type': 'loss', 'content': 0.007259145379066467, 'timestamp': '2025-09-15 03:20:53.356475', 'step': 1921, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.387618', 'step': 1921, 'epoch': 3} {'type': 'loss', 'content': 0.00768707599490881, 'timestamp': '2025-09-15 03:20:53.390468', 'step': 1922, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.420646', 'step': 1922, 'epoch': 3} {'type': 'loss', 'content': 0.0268842875957489, 'timestamp': '2025-09-15 03:20:53.423167', 'step': 1923, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:53.455453', 'step': 1923, 'epoch': 3} {'type': 'loss', 'content': 0.01972857303917408, 'timestamp': '2025-09-15 03:20:53.478851', 'step': 1924, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.508675', 'step': 1924, 'epoch': 3} {'type': 'loss', 'content': 0.022408347576856613, 'timestamp': '2025-09-15 03:20:53.511847', 'step': 1925, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:53.541430', 'step': 1925, 'epoch': 3} {'type': 'loss', 'content': 0.02350449375808239, 'timestamp': '2025-09-15 03:20:53.543472', 'step': 1926, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.573169', 'step': 1926, 'epoch': 3} {'type': 'loss', 'content': 0.002779848873615265, 'timestamp': '2025-09-15 03:20:53.575300', 'step': 1927, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.605608', 'step': 1927, 'epoch': 3} {'type': 'loss', 'content': 0.006933249067515135, 'timestamp': '2025-09-15 03:20:53.629059', 'step': 1928, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.658800', 'step': 1928, 'epoch': 3} {'type': 'loss', 'content': 0.01789715513586998, 'timestamp': '2025-09-15 03:20:53.660814', 'step': 1929, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.691321', 'step': 1929, 'epoch': 3} {'type': 'loss', 'content': 0.049555934965610504, 'timestamp': '2025-09-15 03:20:53.693338', 'step': 1930, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.723340', 'step': 1930, 'epoch': 3} {'type': 'loss', 'content': 0.009596183896064758, 'timestamp': '2025-09-15 03:20:53.725364', 'step': 1931, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.755034', 'step': 1931, 'epoch': 3} {'type': 'loss', 'content': 0.004284258931875229, 'timestamp': '2025-09-15 03:20:53.778474', 'step': 1932, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.807925', 'step': 1932, 'epoch': 3} {'type': 'loss', 'content': 0.001384135102853179, 'timestamp': '2025-09-15 03:20:53.810121', 'step': 1933, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.839919', 'step': 1933, 'epoch': 3} {'type': 'loss', 'content': 0.00989693682640791, 'timestamp': '2025-09-15 03:20:53.842138', 'step': 1934, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.871865', 'step': 1934, 'epoch': 3} {'type': 'loss', 'content': 0.0024838608223944902, 'timestamp': '2025-09-15 03:20:53.873925', 'step': 1935, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.903458', 'step': 1935, 'epoch': 3} {'type': 'loss', 'content': 0.007661298383027315, 'timestamp': '2025-09-15 03:20:53.927356', 'step': 1936, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.957124', 'step': 1936, 'epoch': 3} {'type': 'loss', 'content': 0.0049940976314246655, 'timestamp': '2025-09-15 03:20:53.959512', 'step': 1937, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:53.988953', 'step': 1937, 'epoch': 3} {'type': 'loss', 'content': 0.005934380926191807, 'timestamp': '2025-09-15 03:20:53.991356', 'step': 1938, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:54.705074', 'step': 1938, 'epoch': 3} {'type': 'pplx', 'content': 52201116.26590185, 'timestamp': '2025-09-15 03:20:54.707300', 'step': 1938, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:54.735523', 'step': 1938, 'epoch': 3} {'type': 'loss', 'content': 0.03174247592687607, 'timestamp': '2025-09-15 03:20:54.737647', 'step': 1939, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:54.767669', 'step': 1939, 'epoch': 3} {'type': 'loss', 'content': 0.002668730914592743, 'timestamp': '2025-09-15 03:20:54.791277', 'step': 1940, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:54.821190', 'step': 1940, 'epoch': 3} {'type': 'loss', 'content': 0.006054393015801907, 'timestamp': '2025-09-15 03:20:54.823585', 'step': 1941, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:54.854030', 'step': 1941, 'epoch': 3} {'type': 'loss', 'content': 0.009095175191760063, 'timestamp': '2025-09-15 03:20:54.856439', 'step': 1942, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:54.886495', 'step': 1942, 'epoch': 3} {'type': 'loss', 'content': 0.029130371287465096, 'timestamp': '2025-09-15 03:20:54.888255', 'step': 1943, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:20:54.917756', 'step': 1943, 'epoch': 3} {'type': 'loss', 'content': 0.040686529129743576, 'timestamp': '2025-09-15 03:20:54.941375', 'step': 1944, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:54.971974', 'step': 1944, 'epoch': 3} {'type': 'loss', 'content': 0.0014195304829627275, 'timestamp': '2025-09-15 03:20:54.974304', 'step': 1945, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.005882', 'step': 1945, 'epoch': 3} {'type': 'loss', 'content': 0.009091906249523163, 'timestamp': '2025-09-15 03:20:55.008082', 'step': 1946, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.038113', 'step': 1946, 'epoch': 3} {'type': 'loss', 'content': 0.0034668713342398405, 'timestamp': '2025-09-15 03:20:55.040294', 'step': 1947, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.070322', 'step': 1947, 'epoch': 3} {'type': 'loss', 'content': 0.0020099026151001453, 'timestamp': '2025-09-15 03:20:55.093940', 'step': 1948, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.124139', 'step': 1948, 'epoch': 3} {'type': 'loss', 'content': 0.006301121320575476, 'timestamp': '2025-09-15 03:20:55.126148', 'step': 1949, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:55.156304', 'step': 1949, 'epoch': 3} {'type': 'loss', 'content': 0.008950197137892246, 'timestamp': '2025-09-15 03:20:55.159855', 'step': 1950, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.190260', 'step': 1950, 'epoch': 3} {'type': 'loss', 'content': 0.010447906330227852, 'timestamp': '2025-09-15 03:20:55.192278', 'step': 1951, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.222749', 'step': 1951, 'epoch': 3} {'type': 'loss', 'content': 0.016328440979123116, 'timestamp': '2025-09-15 03:20:55.246128', 'step': 1952, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.275743', 'step': 1952, 'epoch': 3} {'type': 'loss', 'content': 0.0013112931046634912, 'timestamp': '2025-09-15 03:20:55.277433', 'step': 1953, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.306937', 'step': 1953, 'epoch': 3} {'type': 'loss', 'content': 0.002939864993095398, 'timestamp': '2025-09-15 03:20:55.309128', 'step': 1954, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.338856', 'step': 1954, 'epoch': 3} {'type': 'loss', 'content': 0.04834269359707832, 'timestamp': '2025-09-15 03:20:55.340701', 'step': 1955, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:55.370272', 'step': 1955, 'epoch': 3} {'type': 'loss', 'content': 0.017391914501786232, 'timestamp': '2025-09-15 03:20:55.393722', 'step': 1956, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:55.424170', 'step': 1956, 'epoch': 3} {'type': 'loss', 'content': 0.009930714964866638, 'timestamp': '2025-09-15 03:20:55.426034', 'step': 1957, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.455602', 'step': 1957, 'epoch': 3} {'type': 'loss', 'content': 0.004469075705856085, 'timestamp': '2025-09-15 03:20:55.458941', 'step': 1958, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.488991', 'step': 1958, 'epoch': 3} {'type': 'loss', 'content': 0.0024031256325542927, 'timestamp': '2025-09-15 03:20:55.491420', 'step': 1959, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.521239', 'step': 1959, 'epoch': 3} {'type': 'loss', 'content': 0.03323809802532196, 'timestamp': '2025-09-15 03:20:55.544528', 'step': 1960, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:55.574723', 'step': 1960, 'epoch': 3} {'type': 'loss', 'content': 0.002740835305303335, 'timestamp': '2025-09-15 03:20:55.576837', 'step': 1961, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.607287', 'step': 1961, 'epoch': 3} {'type': 'loss', 'content': 0.0028987224213778973, 'timestamp': '2025-09-15 03:20:55.609454', 'step': 1962, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.639352', 'step': 1962, 'epoch': 3} {'type': 'loss', 'content': 0.020512523129582405, 'timestamp': '2025-09-15 03:20:55.641434', 'step': 1963, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.671263', 'step': 1963, 'epoch': 3} {'type': 'loss', 'content': 0.04198675602674484, 'timestamp': '2025-09-15 03:20:55.694909', 'step': 1964, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.724696', 'step': 1964, 'epoch': 3} {'type': 'loss', 'content': 0.006624828092753887, 'timestamp': '2025-09-15 03:20:55.727072', 'step': 1965, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.757139', 'step': 1965, 'epoch': 3} {'type': 'loss', 'content': 0.002662785816937685, 'timestamp': '2025-09-15 03:20:55.759359', 'step': 1966, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.790281', 'step': 1966, 'epoch': 3} {'type': 'loss', 'content': 0.003230338217690587, 'timestamp': '2025-09-15 03:20:55.792494', 'step': 1967, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.822213', 'step': 1967, 'epoch': 3} {'type': 'loss', 'content': 0.017028162255883217, 'timestamp': '2025-09-15 03:20:55.845573', 'step': 1968, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:55.875760', 'step': 1968, 'epoch': 3} {'type': 'loss', 'content': 0.0020218912977725267, 'timestamp': '2025-09-15 03:20:55.877705', 'step': 1969, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:55.908857', 'step': 1969, 'epoch': 3} {'type': 'loss', 'content': 0.010596668347716331, 'timestamp': '2025-09-15 03:20:55.910785', 'step': 1970, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:55.941058', 'step': 1970, 'epoch': 3} {'type': 'loss', 'content': 0.0063810343854129314, 'timestamp': '2025-09-15 03:20:55.943094', 'step': 1971, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:55.974331', 'step': 1971, 'epoch': 3} {'type': 'loss', 'content': 0.024318158626556396, 'timestamp': '2025-09-15 03:20:55.997815', 'step': 1972, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.027461', 'step': 1972, 'epoch': 3} {'type': 'loss', 'content': 0.014820395037531853, 'timestamp': '2025-09-15 03:20:56.029277', 'step': 1973, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.059371', 'step': 1973, 'epoch': 3} {'type': 'loss', 'content': 0.007273904979228973, 'timestamp': '2025-09-15 03:20:56.061585', 'step': 1974, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.092700', 'step': 1974, 'epoch': 3} {'type': 'loss', 'content': 0.0198097825050354, 'timestamp': '2025-09-15 03:20:56.094906', 'step': 1975, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.125797', 'step': 1975, 'epoch': 3} {'type': 'loss', 'content': 0.010190504603087902, 'timestamp': '2025-09-15 03:20:56.149316', 'step': 1976, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.179098', 'step': 1976, 'epoch': 3} {'type': 'loss', 'content': 0.005385191645473242, 'timestamp': '2025-09-15 03:20:56.180907', 'step': 1977, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.210819', 'step': 1977, 'epoch': 3} {'type': 'loss', 'content': 0.011977910064160824, 'timestamp': '2025-09-15 03:20:56.213146', 'step': 1978, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.243537', 'step': 1978, 'epoch': 3} {'type': 'loss', 'content': 0.013383844867348671, 'timestamp': '2025-09-15 03:20:56.245460', 'step': 1979, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.275859', 'step': 1979, 'epoch': 3} {'type': 'loss', 'content': 0.0016832282999530435, 'timestamp': '2025-09-15 03:20:56.299044', 'step': 1980, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.329750', 'step': 1980, 'epoch': 3} {'type': 'loss', 'content': 0.004006501287221909, 'timestamp': '2025-09-15 03:20:56.331897', 'step': 1981, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.362176', 'step': 1981, 'epoch': 3} {'type': 'loss', 'content': 0.009227439761161804, 'timestamp': '2025-09-15 03:20:56.364032', 'step': 1982, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.394255', 'step': 1982, 'epoch': 3} {'type': 'loss', 'content': 0.0073464675806462765, 'timestamp': '2025-09-15 03:20:56.396433', 'step': 1983, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.428168', 'step': 1983, 'epoch': 3} {'type': 'loss', 'content': 0.0061258794739842415, 'timestamp': '2025-09-15 03:20:56.451663', 'step': 1984, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.482146', 'step': 1984, 'epoch': 3} {'type': 'loss', 'content': 0.0024023440200835466, 'timestamp': '2025-09-15 03:20:56.484190', 'step': 1985, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.514611', 'step': 1985, 'epoch': 3} {'type': 'loss', 'content': 0.001554057002067566, 'timestamp': '2025-09-15 03:20:56.516678', 'step': 1986, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:56.549468', 'step': 1986, 'epoch': 3} {'type': 'loss', 'content': 0.02349625527858734, 'timestamp': '2025-09-15 03:20:56.551484', 'step': 1987, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:56.581914', 'step': 1987, 'epoch': 3} {'type': 'loss', 'content': 0.015915708616375923, 'timestamp': '2025-09-15 03:20:56.605227', 'step': 1988, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.635639', 'step': 1988, 'epoch': 3} {'type': 'loss', 'content': 0.0070765577256679535, 'timestamp': '2025-09-15 03:20:56.637897', 'step': 1989, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.667794', 'step': 1989, 'epoch': 3} {'type': 'loss', 'content': 0.0020205960609018803, 'timestamp': '2025-09-15 03:20:56.669569', 'step': 1990, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.699597', 'step': 1990, 'epoch': 3} {'type': 'loss', 'content': 0.03761409968137741, 'timestamp': '2025-09-15 03:20:56.701466', 'step': 1991, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.732875', 'step': 1991, 'epoch': 3} {'type': 'loss', 'content': 0.027002081274986267, 'timestamp': '2025-09-15 03:20:56.756203', 'step': 1992, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:56.786469', 'step': 1992, 'epoch': 3} {'type': 'loss', 'content': 0.05631239339709282, 'timestamp': '2025-09-15 03:20:56.788464', 'step': 1993, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.818334', 'step': 1993, 'epoch': 3} {'type': 'loss', 'content': 0.009488861076533794, 'timestamp': '2025-09-15 03:20:56.820565', 'step': 1994, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:56.851031', 'step': 1994, 'epoch': 3} {'type': 'loss', 'content': 0.0019661146216094494, 'timestamp': '2025-09-15 03:20:56.853077', 'step': 1995, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:20:57.584519', 'step': 1995, 'epoch': 3} {'type': 'pplx', 'content': 52256625.34364043, 'timestamp': '2025-09-15 03:20:57.587161', 'step': 1995, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:57.616644', 'step': 1995, 'epoch': 3} {'type': 'loss', 'content': 0.0030057637486606836, 'timestamp': '2025-09-15 03:20:57.639958', 'step': 1996, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:57.670232', 'step': 1996, 'epoch': 3} {'type': 'loss', 'content': 0.0027218328323215246, 'timestamp': '2025-09-15 03:20:57.672073', 'step': 1997, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:20:57.702246', 'step': 1997, 'epoch': 3} {'type': 'loss', 'content': 0.012235582806169987, 'timestamp': '2025-09-15 03:20:57.704160', 'step': 1998, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:20:57.734468', 'step': 1998, 'epoch': 3} {'type': 'loss', 'content': 0.013591994531452656, 'timestamp': '2025-09-15 03:20:57.736468', 'step': 1999, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:20:57.766900', 'step': 1999, 'epoch': 3} {'type': 'loss', 'content': 0.013293951749801636, 'timestamp': '2025-09-15 03:20:57.790317', 'step': 2000, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2000', 'timestamp': '2025-09-15 03:21:04.244838', 'step': 2000, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.279604', 'step': 2000, 'epoch': 3} {'type': 'loss', 'content': 0.01585422083735466, 'timestamp': '2025-09-15 03:21:04.281723', 'step': 2001, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.313622', 'step': 2001, 'epoch': 3} {'type': 'loss', 'content': 0.001507807755842805, 'timestamp': '2025-09-15 03:21:04.315694', 'step': 2002, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.346442', 'step': 2002, 'epoch': 3} {'type': 'loss', 'content': 0.0021584215573966503, 'timestamp': '2025-09-15 03:21:04.348477', 'step': 2003, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.378328', 'step': 2003, 'epoch': 3} {'type': 'loss', 'content': 0.04452117905020714, 'timestamp': '2025-09-15 03:21:04.401911', 'step': 2004, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:04.432932', 'step': 2004, 'epoch': 3} {'type': 'loss', 'content': 0.0012962737819179893, 'timestamp': '2025-09-15 03:21:04.434908', 'step': 2005, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:04.465187', 'step': 2005, 'epoch': 3} {'type': 'loss', 'content': 0.02925196662545204, 'timestamp': '2025-09-15 03:21:04.467299', 'step': 2006, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.497516', 'step': 2006, 'epoch': 3} {'type': 'loss', 'content': 0.006457092706114054, 'timestamp': '2025-09-15 03:21:04.499404', 'step': 2007, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:04.530176', 'step': 2007, 'epoch': 3} {'type': 'loss', 'content': 0.014313125051558018, 'timestamp': '2025-09-15 03:21:04.553819', 'step': 2008, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.585146', 'step': 2008, 'epoch': 3} {'type': 'loss', 'content': 0.007027280982583761, 'timestamp': '2025-09-15 03:21:04.587228', 'step': 2009, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:04.618126', 'step': 2009, 'epoch': 3} {'type': 'loss', 'content': 0.015289964154362679, 'timestamp': '2025-09-15 03:21:04.620200', 'step': 2010, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:04.650770', 'step': 2010, 'epoch': 3} {'type': 'loss', 'content': 0.006559719797223806, 'timestamp': '2025-09-15 03:21:04.652852', 'step': 2011, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:04.684000', 'step': 2011, 'epoch': 3} {'type': 'loss', 'content': 0.004909235052764416, 'timestamp': '2025-09-15 03:21:04.707783', 'step': 2012, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.738585', 'step': 2012, 'epoch': 3} {'type': 'loss', 'content': 0.002195314271375537, 'timestamp': '2025-09-15 03:21:04.740815', 'step': 2013, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:04.771585', 'step': 2013, 'epoch': 3} {'type': 'loss', 'content': 0.003565679071471095, 'timestamp': '2025-09-15 03:21:04.773869', 'step': 2014, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.805568', 'step': 2014, 'epoch': 3} {'type': 'loss', 'content': 0.028394032269716263, 'timestamp': '2025-09-15 03:21:04.807914', 'step': 2015, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.837639', 'step': 2015, 'epoch': 3} {'type': 'loss', 'content': 0.012641467154026031, 'timestamp': '2025-09-15 03:21:04.861599', 'step': 2016, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.892180', 'step': 2016, 'epoch': 3} {'type': 'loss', 'content': 0.005878259893506765, 'timestamp': '2025-09-15 03:21:04.894417', 'step': 2017, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:04.925315', 'step': 2017, 'epoch': 3} {'type': 'loss', 'content': 0.011266002431511879, 'timestamp': '2025-09-15 03:21:04.927410', 'step': 2018, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:04.958086', 'step': 2018, 'epoch': 3} {'type': 'loss', 'content': 0.0023640652652829885, 'timestamp': '2025-09-15 03:21:04.960093', 'step': 2019, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:04.991079', 'step': 2019, 'epoch': 3} {'type': 'loss', 'content': 0.007986019365489483, 'timestamp': '2025-09-15 03:21:05.014635', 'step': 2020, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.045562', 'step': 2020, 'epoch': 3} {'type': 'loss', 'content': 0.002925436245277524, 'timestamp': '2025-09-15 03:21:05.048057', 'step': 2021, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.079498', 'step': 2021, 'epoch': 3} {'type': 'loss', 'content': 0.005862175952643156, 'timestamp': '2025-09-15 03:21:05.082027', 'step': 2022, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.113280', 'step': 2022, 'epoch': 3} {'type': 'loss', 'content': 0.0018555274000391364, 'timestamp': '2025-09-15 03:21:05.115406', 'step': 2023, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.145494', 'step': 2023, 'epoch': 3} {'type': 'loss', 'content': 0.01033469382673502, 'timestamp': '2025-09-15 03:21:05.168989', 'step': 2024, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.199517', 'step': 2024, 'epoch': 3} {'type': 'loss', 'content': 0.016592005267739296, 'timestamp': '2025-09-15 03:21:05.201697', 'step': 2025, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:05.232272', 'step': 2025, 'epoch': 3} {'type': 'loss', 'content': 0.0015080816810950637, 'timestamp': '2025-09-15 03:21:05.234420', 'step': 2026, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.265099', 'step': 2026, 'epoch': 3} {'type': 'loss', 'content': 0.002308989642187953, 'timestamp': '2025-09-15 03:21:05.267237', 'step': 2027, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.298232', 'step': 2027, 'epoch': 3} {'type': 'loss', 'content': 0.005254893563687801, 'timestamp': '2025-09-15 03:21:05.322029', 'step': 2028, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.352870', 'step': 2028, 'epoch': 3} {'type': 'loss', 'content': 0.02585950866341591, 'timestamp': '2025-09-15 03:21:05.355003', 'step': 2029, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.385641', 'step': 2029, 'epoch': 3} {'type': 'loss', 'content': 0.017035724595189095, 'timestamp': '2025-09-15 03:21:05.387615', 'step': 2030, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.418267', 'step': 2030, 'epoch': 3} {'type': 'loss', 'content': 0.0030646901577711105, 'timestamp': '2025-09-15 03:21:05.420386', 'step': 2031, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.451148', 'step': 2031, 'epoch': 3} {'type': 'loss', 'content': 0.005517587065696716, 'timestamp': '2025-09-15 03:21:05.474646', 'step': 2032, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:05.505339', 'step': 2032, 'epoch': 3} {'type': 'loss', 'content': 0.015570155344903469, 'timestamp': '2025-09-15 03:21:05.507452', 'step': 2033, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.537443', 'step': 2033, 'epoch': 3} {'type': 'loss', 'content': 0.0066670882515609264, 'timestamp': '2025-09-15 03:21:05.539505', 'step': 2034, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.569512', 'step': 2034, 'epoch': 3} {'type': 'loss', 'content': 0.0035194784868508577, 'timestamp': '2025-09-15 03:21:05.571683', 'step': 2035, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.602120', 'step': 2035, 'epoch': 3} {'type': 'loss', 'content': 0.015802303329110146, 'timestamp': '2025-09-15 03:21:05.625593', 'step': 2036, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.658084', 'step': 2036, 'epoch': 3} {'type': 'loss', 'content': 0.008249717764556408, 'timestamp': '2025-09-15 03:21:05.660321', 'step': 2037, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:05.690915', 'step': 2037, 'epoch': 3} {'type': 'loss', 'content': 0.02149888686835766, 'timestamp': '2025-09-15 03:21:05.692955', 'step': 2038, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:05.723767', 'step': 2038, 'epoch': 3} {'type': 'loss', 'content': 0.004375663120299578, 'timestamp': '2025-09-15 03:21:05.726164', 'step': 2039, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.757205', 'step': 2039, 'epoch': 3} {'type': 'loss', 'content': 0.0018813589122146368, 'timestamp': '2025-09-15 03:21:05.780917', 'step': 2040, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.811556', 'step': 2040, 'epoch': 3} {'type': 'loss', 'content': 0.012875386513769627, 'timestamp': '2025-09-15 03:21:05.813683', 'step': 2041, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.843697', 'step': 2041, 'epoch': 3} {'type': 'loss', 'content': 0.0009363238350488245, 'timestamp': '2025-09-15 03:21:05.846413', 'step': 2042, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.882725', 'step': 2042, 'epoch': 3} {'type': 'loss', 'content': 0.013507531024515629, 'timestamp': '2025-09-15 03:21:05.884925', 'step': 2043, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.915336', 'step': 2043, 'epoch': 3} {'type': 'loss', 'content': 0.02033652924001217, 'timestamp': '2025-09-15 03:21:05.939133', 'step': 2044, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:05.969433', 'step': 2044, 'epoch': 3} {'type': 'loss', 'content': 0.004257784225046635, 'timestamp': '2025-09-15 03:21:05.971495', 'step': 2045, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:06.001563', 'step': 2045, 'epoch': 3} {'type': 'loss', 'content': 0.04417576640844345, 'timestamp': '2025-09-15 03:21:06.003604', 'step': 2046, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:06.033828', 'step': 2046, 'epoch': 3} {'type': 'loss', 'content': 0.01268736831843853, 'timestamp': '2025-09-15 03:21:06.035902', 'step': 2047, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:06.066219', 'step': 2047, 'epoch': 3} {'type': 'loss', 'content': 0.0059195528738200665, 'timestamp': '2025-09-15 03:21:06.090609', 'step': 2048, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:06.122957', 'step': 2048, 'epoch': 3} {'type': 'loss', 'content': 0.002530893078073859, 'timestamp': '2025-09-15 03:21:06.124928', 'step': 2049, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:06.155336', 'step': 2049, 'epoch': 3} {'type': 'loss', 'content': 0.007208889815956354, 'timestamp': '2025-09-15 03:21:06.157413', 'step': 2050, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:06.188975', 'step': 2050, 'epoch': 3} {'type': 'loss', 'content': 0.007339324336498976, 'timestamp': '2025-09-15 03:21:06.191223', 'step': 2051, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:06.222004', 'step': 2051, 'epoch': 3} {'type': 'loss', 'content': 0.031263567507267, 'timestamp': '2025-09-15 03:21:06.245643', 'step': 2052, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:06.985869', 'step': 2052, 'epoch': 3} {'type': 'pplx', 'content': 59827417.800061285, 'timestamp': '2025-09-15 03:21:06.988010', 'step': 2052, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.017171', 'step': 2052, 'epoch': 3} {'type': 'loss', 'content': 0.004454520996659994, 'timestamp': '2025-09-15 03:21:07.019431', 'step': 2053, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.051489', 'step': 2053, 'epoch': 3} {'type': 'loss', 'content': 0.003587186336517334, 'timestamp': '2025-09-15 03:21:07.053563', 'step': 2054, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.084326', 'step': 2054, 'epoch': 3} {'type': 'loss', 'content': 0.005687421653419733, 'timestamp': '2025-09-15 03:21:07.086619', 'step': 2055, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.117312', 'step': 2055, 'epoch': 3} {'type': 'loss', 'content': 0.0027272473089396954, 'timestamp': '2025-09-15 03:21:07.141245', 'step': 2056, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.172840', 'step': 2056, 'epoch': 3} {'type': 'loss', 'content': 0.015834230929613113, 'timestamp': '2025-09-15 03:21:07.174912', 'step': 2057, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.205655', 'step': 2057, 'epoch': 3} {'type': 'loss', 'content': 0.002714097034186125, 'timestamp': '2025-09-15 03:21:07.208618', 'step': 2058, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.238702', 'step': 2058, 'epoch': 3} {'type': 'loss', 'content': 0.005063068121671677, 'timestamp': '2025-09-15 03:21:07.240817', 'step': 2059, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.271865', 'step': 2059, 'epoch': 3} {'type': 'loss', 'content': 0.003443613648414612, 'timestamp': '2025-09-15 03:21:07.295851', 'step': 2060, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.326844', 'step': 2060, 'epoch': 3} {'type': 'loss', 'content': 0.0057801674120128155, 'timestamp': '2025-09-15 03:21:07.328860', 'step': 2061, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:07.360331', 'step': 2061, 'epoch': 3} {'type': 'loss', 'content': 0.0011709003010764718, 'timestamp': '2025-09-15 03:21:07.362481', 'step': 2062, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:07.392641', 'step': 2062, 'epoch': 3} {'type': 'loss', 'content': 0.016910651698708534, 'timestamp': '2025-09-15 03:21:07.395025', 'step': 2063, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.425973', 'step': 2063, 'epoch': 3} {'type': 'loss', 'content': 0.007694550324231386, 'timestamp': '2025-09-15 03:21:07.449471', 'step': 2064, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.481036', 'step': 2064, 'epoch': 3} {'type': 'loss', 'content': 0.0011027039727196097, 'timestamp': '2025-09-15 03:21:07.483245', 'step': 2065, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.514017', 'step': 2065, 'epoch': 3} {'type': 'loss', 'content': 0.006830222904682159, 'timestamp': '2025-09-15 03:21:07.516068', 'step': 2066, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:07.546654', 'step': 2066, 'epoch': 3} {'type': 'loss', 'content': 0.014835420064628124, 'timestamp': '2025-09-15 03:21:07.548709', 'step': 2067, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.579588', 'step': 2067, 'epoch': 3} {'type': 'loss', 'content': 0.02849721722304821, 'timestamp': '2025-09-15 03:21:07.603145', 'step': 2068, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.634053', 'step': 2068, 'epoch': 3} {'type': 'loss', 'content': 0.0010745684849098325, 'timestamp': '2025-09-15 03:21:07.636202', 'step': 2069, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.666979', 'step': 2069, 'epoch': 3} {'type': 'loss', 'content': 0.009703867137432098, 'timestamp': '2025-09-15 03:21:07.669452', 'step': 2070, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:07.700172', 'step': 2070, 'epoch': 3} {'type': 'loss', 'content': 0.0004812530241906643, 'timestamp': '2025-09-15 03:21:07.702361', 'step': 2071, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.732785', 'step': 2071, 'epoch': 3} {'type': 'loss', 'content': 0.026970041915774345, 'timestamp': '2025-09-15 03:21:07.756546', 'step': 2072, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.788069', 'step': 2072, 'epoch': 3} {'type': 'loss', 'content': 0.01335603091865778, 'timestamp': '2025-09-15 03:21:07.790299', 'step': 2073, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.820655', 'step': 2073, 'epoch': 3} {'type': 'loss', 'content': 0.0024553367402404547, 'timestamp': '2025-09-15 03:21:07.822901', 'step': 2074, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:07.853406', 'step': 2074, 'epoch': 3} {'type': 'loss', 'content': 0.0027745079714804888, 'timestamp': '2025-09-15 03:21:07.855520', 'step': 2075, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.885728', 'step': 2075, 'epoch': 3} {'type': 'loss', 'content': 0.00588145712390542, 'timestamp': '2025-09-15 03:21:07.909403', 'step': 2076, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:07.940245', 'step': 2076, 'epoch': 3} {'type': 'loss', 'content': 0.004154204856604338, 'timestamp': '2025-09-15 03:21:07.942235', 'step': 2077, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:07.973921', 'step': 2077, 'epoch': 3} {'type': 'loss', 'content': 0.024734104052186012, 'timestamp': '2025-09-15 03:21:07.975979', 'step': 2078, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.006119', 'step': 2078, 'epoch': 3} {'type': 'loss', 'content': 0.020028170198202133, 'timestamp': '2025-09-15 03:21:08.007851', 'step': 2079, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.038062', 'step': 2079, 'epoch': 3} {'type': 'loss', 'content': 0.005237225443124771, 'timestamp': '2025-09-15 03:21:08.061562', 'step': 2080, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.091861', 'step': 2080, 'epoch': 3} {'type': 'loss', 'content': 0.006769266445189714, 'timestamp': '2025-09-15 03:21:08.094052', 'step': 2081, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.124369', 'step': 2081, 'epoch': 3} {'type': 'loss', 'content': 0.0024314168840646744, 'timestamp': '2025-09-15 03:21:08.126294', 'step': 2082, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.156660', 'step': 2082, 'epoch': 3} {'type': 'loss', 'content': 0.008758151903748512, 'timestamp': '2025-09-15 03:21:08.158628', 'step': 2083, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.189137', 'step': 2083, 'epoch': 3} {'type': 'loss', 'content': 0.001061967690475285, 'timestamp': '2025-09-15 03:21:08.212826', 'step': 2084, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.244237', 'step': 2084, 'epoch': 3} {'type': 'loss', 'content': 0.052610017359256744, 'timestamp': '2025-09-15 03:21:08.246563', 'step': 2085, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.278921', 'step': 2085, 'epoch': 3} {'type': 'loss', 'content': 0.011462748982012272, 'timestamp': '2025-09-15 03:21:08.281078', 'step': 2086, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.311628', 'step': 2086, 'epoch': 3} {'type': 'loss', 'content': 0.005782654043287039, 'timestamp': '2025-09-15 03:21:08.313898', 'step': 2087, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.345191', 'step': 2087, 'epoch': 3} {'type': 'loss', 'content': 0.008149498142302036, 'timestamp': '2025-09-15 03:21:08.368821', 'step': 2088, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.399636', 'step': 2088, 'epoch': 3} {'type': 'loss', 'content': 0.001337492954917252, 'timestamp': '2025-09-15 03:21:08.401922', 'step': 2089, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:08.432968', 'step': 2089, 'epoch': 3} {'type': 'loss', 'content': 0.0016679230611771345, 'timestamp': '2025-09-15 03:21:08.435273', 'step': 2090, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.465504', 'step': 2090, 'epoch': 3} {'type': 'loss', 'content': 0.001318950904533267, 'timestamp': '2025-09-15 03:21:08.467623', 'step': 2091, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.498023', 'step': 2091, 'epoch': 3} {'type': 'loss', 'content': 0.007607594132423401, 'timestamp': '2025-09-15 03:21:08.521602', 'step': 2092, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.553684', 'step': 2092, 'epoch': 3} {'type': 'loss', 'content': 0.0025145213585346937, 'timestamp': '2025-09-15 03:21:08.555960', 'step': 2093, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.587335', 'step': 2093, 'epoch': 3} {'type': 'loss', 'content': 0.002167313126847148, 'timestamp': '2025-09-15 03:21:08.592067', 'step': 2094, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.625875', 'step': 2094, 'epoch': 3} {'type': 'loss', 'content': 0.0023553003557026386, 'timestamp': '2025-09-15 03:21:08.630849', 'step': 2095, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.664530', 'step': 2095, 'epoch': 3} {'type': 'loss', 'content': 0.04698661342263222, 'timestamp': '2025-09-15 03:21:08.688042', 'step': 2096, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:08.718577', 'step': 2096, 'epoch': 3} {'type': 'loss', 'content': 0.0016108545241877437, 'timestamp': '2025-09-15 03:21:08.720747', 'step': 2097, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:08.750979', 'step': 2097, 'epoch': 3} {'type': 'loss', 'content': 0.05345345288515091, 'timestamp': '2025-09-15 03:21:08.753657', 'step': 2098, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.785157', 'step': 2098, 'epoch': 3} {'type': 'loss', 'content': 0.017849748954176903, 'timestamp': '2025-09-15 03:21:08.787350', 'step': 2099, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.817721', 'step': 2099, 'epoch': 3} {'type': 'loss', 'content': 0.005959447007626295, 'timestamp': '2025-09-15 03:21:08.841043', 'step': 2100, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:08.872160', 'step': 2100, 'epoch': 3} {'type': 'loss', 'content': 0.0007292155059985816, 'timestamp': '2025-09-15 03:21:08.874250', 'step': 2101, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:08.904928', 'step': 2101, 'epoch': 3} {'type': 'loss', 'content': 0.0012959071900695562, 'timestamp': '2025-09-15 03:21:08.906997', 'step': 2102, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:08.937128', 'step': 2102, 'epoch': 3} {'type': 'loss', 'content': 0.01766919530928135, 'timestamp': '2025-09-15 03:21:08.939101', 'step': 2103, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:08.969575', 'step': 2103, 'epoch': 3} {'type': 'loss', 'content': 0.0009708349825814366, 'timestamp': '2025-09-15 03:21:08.993311', 'step': 2104, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:09.023627', 'step': 2104, 'epoch': 3} {'type': 'loss', 'content': 0.0029763688798993826, 'timestamp': '2025-09-15 03:21:09.025585', 'step': 2105, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:09.055945', 'step': 2105, 'epoch': 3} {'type': 'loss', 'content': 0.0015365873696282506, 'timestamp': '2025-09-15 03:21:09.057913', 'step': 2106, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:09.089373', 'step': 2106, 'epoch': 3} {'type': 'loss', 'content': 0.005631761159747839, 'timestamp': '2025-09-15 03:21:09.091425', 'step': 2107, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:09.122067', 'step': 2107, 'epoch': 3} {'type': 'loss', 'content': 0.008225338533520699, 'timestamp': '2025-09-15 03:21:09.145599', 'step': 2108, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:09.176704', 'step': 2108, 'epoch': 3} {'type': 'loss', 'content': 0.017039282247424126, 'timestamp': '2025-09-15 03:21:09.178813', 'step': 2109, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:09.912101', 'step': 2109, 'epoch': 3} {'type': 'pplx', 'content': 61278679.55301822, 'timestamp': '2025-09-15 03:21:09.914296', 'step': 2109, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:09.943144', 'step': 2109, 'epoch': 3} {'type': 'loss', 'content': 0.00293324189260602, 'timestamp': '2025-09-15 03:21:09.945284', 'step': 2110, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:09.976065', 'step': 2110, 'epoch': 3} {'type': 'loss', 'content': 0.0002962352300528437, 'timestamp': '2025-09-15 03:21:09.978298', 'step': 2111, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.008554', 'step': 2111, 'epoch': 3} {'type': 'loss', 'content': 0.010974375531077385, 'timestamp': '2025-09-15 03:21:10.032052', 'step': 2112, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.062926', 'step': 2112, 'epoch': 3} {'type': 'loss', 'content': 0.0032788703683763742, 'timestamp': '2025-09-15 03:21:10.064861', 'step': 2113, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.095904', 'step': 2113, 'epoch': 3} {'type': 'loss', 'content': 0.028654035180807114, 'timestamp': '2025-09-15 03:21:10.098187', 'step': 2114, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.129439', 'step': 2114, 'epoch': 3} {'type': 'loss', 'content': 0.0012286610435694456, 'timestamp': '2025-09-15 03:21:10.131764', 'step': 2115, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.162138', 'step': 2115, 'epoch': 3} {'type': 'loss', 'content': 0.0076230731792747974, 'timestamp': '2025-09-15 03:21:10.185656', 'step': 2116, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.216188', 'step': 2116, 'epoch': 3} {'type': 'loss', 'content': 0.0033180455211549997, 'timestamp': '2025-09-15 03:21:10.218154', 'step': 2117, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.248299', 'step': 2117, 'epoch': 3} {'type': 'loss', 'content': 0.038955122232437134, 'timestamp': '2025-09-15 03:21:10.250437', 'step': 2118, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.281329', 'step': 2118, 'epoch': 3} {'type': 'loss', 'content': 0.0011777119943872094, 'timestamp': '2025-09-15 03:21:10.283430', 'step': 2119, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.313569', 'step': 2119, 'epoch': 3} {'type': 'loss', 'content': 0.0006529755191877484, 'timestamp': '2025-09-15 03:21:10.337184', 'step': 2120, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:10.367902', 'step': 2120, 'epoch': 3} {'type': 'loss', 'content': 0.0013485082890838385, 'timestamp': '2025-09-15 03:21:10.369919', 'step': 2121, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.400860', 'step': 2121, 'epoch': 3} {'type': 'loss', 'content': 0.025854647159576416, 'timestamp': '2025-09-15 03:21:10.402919', 'step': 2122, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.433430', 'step': 2122, 'epoch': 3} {'type': 'loss', 'content': 0.02108658477663994, 'timestamp': '2025-09-15 03:21:10.435446', 'step': 2123, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.465322', 'step': 2123, 'epoch': 3} {'type': 'loss', 'content': 0.001103846007026732, 'timestamp': '2025-09-15 03:21:10.489035', 'step': 2124, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.519710', 'step': 2124, 'epoch': 3} {'type': 'loss', 'content': 0.007358514703810215, 'timestamp': '2025-09-15 03:21:10.521856', 'step': 2125, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:10.552503', 'step': 2125, 'epoch': 3} {'type': 'loss', 'content': 0.005075577646493912, 'timestamp': '2025-09-15 03:21:10.554566', 'step': 2126, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.584500', 'step': 2126, 'epoch': 3} {'type': 'loss', 'content': 0.007805339992046356, 'timestamp': '2025-09-15 03:21:10.586806', 'step': 2127, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.616428', 'step': 2127, 'epoch': 3} {'type': 'loss', 'content': 0.013435539789497852, 'timestamp': '2025-09-15 03:21:10.639871', 'step': 2128, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.669886', 'step': 2128, 'epoch': 3} {'type': 'loss', 'content': 0.01014357153326273, 'timestamp': '2025-09-15 03:21:10.672026', 'step': 2129, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.703349', 'step': 2129, 'epoch': 3} {'type': 'loss', 'content': 0.0019475395092740655, 'timestamp': '2025-09-15 03:21:10.705559', 'step': 2130, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.735939', 'step': 2130, 'epoch': 3} {'type': 'loss', 'content': 0.0018576009897515178, 'timestamp': '2025-09-15 03:21:10.738032', 'step': 2131, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.768017', 'step': 2131, 'epoch': 3} {'type': 'loss', 'content': 0.006272417493164539, 'timestamp': '2025-09-15 03:21:10.791484', 'step': 2132, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.821780', 'step': 2132, 'epoch': 3} {'type': 'loss', 'content': 0.017416836693882942, 'timestamp': '2025-09-15 03:21:10.823819', 'step': 2133, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:10.854594', 'step': 2133, 'epoch': 3} {'type': 'loss', 'content': 0.0018346422584727407, 'timestamp': '2025-09-15 03:21:10.856629', 'step': 2134, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.887336', 'step': 2134, 'epoch': 3} {'type': 'loss', 'content': 0.002072213450446725, 'timestamp': '2025-09-15 03:21:10.889639', 'step': 2135, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:10.920267', 'step': 2135, 'epoch': 3} {'type': 'loss', 'content': 0.0008718724129721522, 'timestamp': '2025-09-15 03:21:10.943527', 'step': 2136, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:10.974610', 'step': 2136, 'epoch': 3} {'type': 'loss', 'content': 0.0033810038585215807, 'timestamp': '2025-09-15 03:21:10.976851', 'step': 2137, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.008118', 'step': 2137, 'epoch': 3} {'type': 'loss', 'content': 0.005350136663764715, 'timestamp': '2025-09-15 03:21:11.011770', 'step': 2138, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.048132', 'step': 2138, 'epoch': 3} {'type': 'loss', 'content': 0.0008165360777638853, 'timestamp': '2025-09-15 03:21:11.050158', 'step': 2139, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.080698', 'step': 2139, 'epoch': 3} {'type': 'loss', 'content': 0.0024626676458865404, 'timestamp': '2025-09-15 03:21:11.104499', 'step': 2140, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.136724', 'step': 2140, 'epoch': 3} {'type': 'loss', 'content': 0.0024115960113704205, 'timestamp': '2025-09-15 03:21:11.152656', 'step': 2141, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.191628', 'step': 2141, 'epoch': 3} {'type': 'loss', 'content': 0.006479751318693161, 'timestamp': '2025-09-15 03:21:11.194361', 'step': 2142, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.224551', 'step': 2142, 'epoch': 3} {'type': 'loss', 'content': 0.003326453035697341, 'timestamp': '2025-09-15 03:21:11.226650', 'step': 2143, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.256885', 'step': 2143, 'epoch': 3} {'type': 'loss', 'content': 0.0039823888801038265, 'timestamp': '2025-09-15 03:21:11.280947', 'step': 2144, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.312708', 'step': 2144, 'epoch': 3} {'type': 'loss', 'content': 0.0015015477547422051, 'timestamp': '2025-09-15 03:21:11.314248', 'step': 2145, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.344818', 'step': 2145, 'epoch': 3} {'type': 'loss', 'content': 0.00112843734677881, 'timestamp': '2025-09-15 03:21:11.346799', 'step': 2146, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.380143', 'step': 2146, 'epoch': 3} {'type': 'loss', 'content': 0.007287667598575354, 'timestamp': '2025-09-15 03:21:11.382202', 'step': 2147, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.412237', 'step': 2147, 'epoch': 3} {'type': 'loss', 'content': 0.001394521677866578, 'timestamp': '2025-09-15 03:21:11.435382', 'step': 2148, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.470791', 'step': 2148, 'epoch': 3} {'type': 'loss', 'content': 0.009362315759062767, 'timestamp': '2025-09-15 03:21:11.475008', 'step': 2149, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.505632', 'step': 2149, 'epoch': 3} {'type': 'loss', 'content': 0.0004021845816168934, 'timestamp': '2025-09-15 03:21:11.513474', 'step': 2150, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.544060', 'step': 2150, 'epoch': 3} {'type': 'loss', 'content': 0.008383884094655514, 'timestamp': '2025-09-15 03:21:11.547090', 'step': 2151, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:11.577810', 'step': 2151, 'epoch': 3} {'type': 'loss', 'content': 0.004849635995924473, 'timestamp': '2025-09-15 03:21:11.601340', 'step': 2152, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.633232', 'step': 2152, 'epoch': 3} {'type': 'loss', 'content': 0.004479328636080027, 'timestamp': '2025-09-15 03:21:11.635441', 'step': 2153, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:11.667934', 'step': 2153, 'epoch': 3} {'type': 'loss', 'content': 0.0022254232317209244, 'timestamp': '2025-09-15 03:21:11.671004', 'step': 2154, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:11.702775', 'step': 2154, 'epoch': 3} {'type': 'loss', 'content': 0.0015676198527216911, 'timestamp': '2025-09-15 03:21:11.705045', 'step': 2155, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.735864', 'step': 2155, 'epoch': 3} {'type': 'loss', 'content': 0.004587561823427677, 'timestamp': '2025-09-15 03:21:11.759918', 'step': 2156, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.796079', 'step': 2156, 'epoch': 3} {'type': 'loss', 'content': 0.0042901188135147095, 'timestamp': '2025-09-15 03:21:11.798427', 'step': 2157, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:11.828770', 'step': 2157, 'epoch': 3} {'type': 'loss', 'content': 0.0016539504285901785, 'timestamp': '2025-09-15 03:21:11.831852', 'step': 2158, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:11.862034', 'step': 2158, 'epoch': 3} {'type': 'loss', 'content': 0.0031304715666919947, 'timestamp': '2025-09-15 03:21:11.864189', 'step': 2159, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.896918', 'step': 2159, 'epoch': 3} {'type': 'loss', 'content': 0.002231004647910595, 'timestamp': '2025-09-15 03:21:11.920265', 'step': 2160, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:11.951703', 'step': 2160, 'epoch': 3} {'type': 'loss', 'content': 0.010940919630229473, 'timestamp': '2025-09-15 03:21:11.953750', 'step': 2161, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:11.984076', 'step': 2161, 'epoch': 3} {'type': 'loss', 'content': 0.007696912158280611, 'timestamp': '2025-09-15 03:21:11.986626', 'step': 2162, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:12.016941', 'step': 2162, 'epoch': 3} {'type': 'loss', 'content': 0.008095295168459415, 'timestamp': '2025-09-15 03:21:12.018710', 'step': 2163, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:12.048871', 'step': 2163, 'epoch': 3} {'type': 'loss', 'content': 0.005427549593150616, 'timestamp': '2025-09-15 03:21:12.072142', 'step': 2164, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:12.104165', 'step': 2164, 'epoch': 3} {'type': 'loss', 'content': 0.0006827453034929931, 'timestamp': '2025-09-15 03:21:12.106216', 'step': 2165, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:12.136734', 'step': 2165, 'epoch': 3} {'type': 'loss', 'content': 0.004006941802799702, 'timestamp': '2025-09-15 03:21:12.138836', 'step': 2166, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:12.880619', 'step': 2166, 'epoch': 3} {'type': 'pplx', 'content': 56358345.9592242, 'timestamp': '2025-09-15 03:21:12.882669', 'step': 2166, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:12.912218', 'step': 2166, 'epoch': 3} {'type': 'loss', 'content': 0.022966187447309494, 'timestamp': '2025-09-15 03:21:12.914406', 'step': 2167, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:12.944500', 'step': 2167, 'epoch': 3} {'type': 'loss', 'content': 0.0029004632961004972, 'timestamp': '2025-09-15 03:21:12.968545', 'step': 2168, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:12.999243', 'step': 2168, 'epoch': 3} {'type': 'loss', 'content': 0.005183414090424776, 'timestamp': '2025-09-15 03:21:13.001340', 'step': 2169, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:13.032356', 'step': 2169, 'epoch': 3} {'type': 'loss', 'content': 0.0002951786736957729, 'timestamp': '2025-09-15 03:21:13.034879', 'step': 2170, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.065815', 'step': 2170, 'epoch': 3} {'type': 'loss', 'content': 0.001299048657529056, 'timestamp': '2025-09-15 03:21:13.067917', 'step': 2171, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.098628', 'step': 2171, 'epoch': 3} {'type': 'loss', 'content': 0.0007869719411246479, 'timestamp': '2025-09-15 03:21:13.122233', 'step': 2172, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.153080', 'step': 2172, 'epoch': 3} {'type': 'loss', 'content': 0.0014901576796546578, 'timestamp': '2025-09-15 03:21:13.155302', 'step': 2173, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.186918', 'step': 2173, 'epoch': 3} {'type': 'loss', 'content': 0.018634099513292313, 'timestamp': '2025-09-15 03:21:13.189188', 'step': 2174, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:13.219873', 'step': 2174, 'epoch': 3} {'type': 'loss', 'content': 0.0033860153052955866, 'timestamp': '2025-09-15 03:21:13.221763', 'step': 2175, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.252492', 'step': 2175, 'epoch': 3} {'type': 'loss', 'content': 0.0015741854440420866, 'timestamp': '2025-09-15 03:21:13.275939', 'step': 2176, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.306567', 'step': 2176, 'epoch': 3} {'type': 'loss', 'content': 0.002750501735135913, 'timestamp': '2025-09-15 03:21:13.308507', 'step': 2177, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.340242', 'step': 2177, 'epoch': 3} {'type': 'loss', 'content': 0.07448706775903702, 'timestamp': '2025-09-15 03:21:13.342337', 'step': 2178, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.372602', 'step': 2178, 'epoch': 3} {'type': 'loss', 'content': 0.0007458441541530192, 'timestamp': '2025-09-15 03:21:13.375390', 'step': 2179, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.406151', 'step': 2179, 'epoch': 3} {'type': 'loss', 'content': 0.0018982002511620522, 'timestamp': '2025-09-15 03:21:13.429758', 'step': 2180, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.460680', 'step': 2180, 'epoch': 3} {'type': 'loss', 'content': 0.0009121177718043327, 'timestamp': '2025-09-15 03:21:13.462911', 'step': 2181, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.493418', 'step': 2181, 'epoch': 3} {'type': 'loss', 'content': 0.0008120434358716011, 'timestamp': '2025-09-15 03:21:13.495546', 'step': 2182, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.526592', 'step': 2182, 'epoch': 3} {'type': 'loss', 'content': 0.0011436872882768512, 'timestamp': '2025-09-15 03:21:13.528907', 'step': 2183, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.559433', 'step': 2183, 'epoch': 3} {'type': 'loss', 'content': 0.0037187892012298107, 'timestamp': '2025-09-15 03:21:13.583053', 'step': 2184, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.613327', 'step': 2184, 'epoch': 3} {'type': 'loss', 'content': 0.009429527446627617, 'timestamp': '2025-09-15 03:21:13.615437', 'step': 2185, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.648087', 'step': 2185, 'epoch': 3} {'type': 'loss', 'content': 0.0013286563334986567, 'timestamp': '2025-09-15 03:21:13.650170', 'step': 2186, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.680847', 'step': 2186, 'epoch': 3} {'type': 'loss', 'content': 0.02385484054684639, 'timestamp': '2025-09-15 03:21:13.683310', 'step': 2187, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.714257', 'step': 2187, 'epoch': 3} {'type': 'loss', 'content': 0.002276848303154111, 'timestamp': '2025-09-15 03:21:13.737795', 'step': 2188, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.768423', 'step': 2188, 'epoch': 3} {'type': 'loss', 'content': 0.005591376684606075, 'timestamp': '2025-09-15 03:21:13.770729', 'step': 2189, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.801012', 'step': 2189, 'epoch': 3} {'type': 'loss', 'content': 0.002838743384927511, 'timestamp': '2025-09-15 03:21:13.803036', 'step': 2190, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:13.833582', 'step': 2190, 'epoch': 3} {'type': 'loss', 'content': 0.010679845698177814, 'timestamp': '2025-09-15 03:21:13.835668', 'step': 2191, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.865645', 'step': 2191, 'epoch': 3} {'type': 'loss', 'content': 0.0006234788452275097, 'timestamp': '2025-09-15 03:21:13.889090', 'step': 2192, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:13.921076', 'step': 2192, 'epoch': 3} {'type': 'loss', 'content': 0.00037034088745713234, 'timestamp': '2025-09-15 03:21:13.923242', 'step': 2193, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:13.954154', 'step': 2193, 'epoch': 3} {'type': 'loss', 'content': 0.016694631427526474, 'timestamp': '2025-09-15 03:21:13.956542', 'step': 2194, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:13.986376', 'step': 2194, 'epoch': 3} {'type': 'loss', 'content': 0.000887808040715754, 'timestamp': '2025-09-15 03:21:13.988785', 'step': 2195, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.019258', 'step': 2195, 'epoch': 3} {'type': 'loss', 'content': 0.03821894899010658, 'timestamp': '2025-09-15 03:21:14.042892', 'step': 2196, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.073956', 'step': 2196, 'epoch': 3} {'type': 'loss', 'content': 0.018787646666169167, 'timestamp': '2025-09-15 03:21:14.075896', 'step': 2197, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.106724', 'step': 2197, 'epoch': 3} {'type': 'loss', 'content': 0.0020286752842366695, 'timestamp': '2025-09-15 03:21:14.108849', 'step': 2198, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.143937', 'step': 2198, 'epoch': 3} {'type': 'loss', 'content': 0.0011020256206393242, 'timestamp': '2025-09-15 03:21:14.145897', 'step': 2199, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.177252', 'step': 2199, 'epoch': 3} {'type': 'loss', 'content': 0.000470021681394428, 'timestamp': '2025-09-15 03:21:14.200843', 'step': 2200, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:14.231636', 'step': 2200, 'epoch': 3} {'type': 'loss', 'content': 0.0018282111268490553, 'timestamp': '2025-09-15 03:21:14.234022', 'step': 2201, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.264926', 'step': 2201, 'epoch': 3} {'type': 'loss', 'content': 0.01462145708501339, 'timestamp': '2025-09-15 03:21:14.267006', 'step': 2202, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.297935', 'step': 2202, 'epoch': 3} {'type': 'loss', 'content': 0.0006548243691213429, 'timestamp': '2025-09-15 03:21:14.299962', 'step': 2203, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.330723', 'step': 2203, 'epoch': 3} {'type': 'loss', 'content': 0.0006687435670755804, 'timestamp': '2025-09-15 03:21:14.354453', 'step': 2204, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.385874', 'step': 2204, 'epoch': 3} {'type': 'loss', 'content': 0.0005846781423315406, 'timestamp': '2025-09-15 03:21:14.388104', 'step': 2205, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.418666', 'step': 2205, 'epoch': 3} {'type': 'loss', 'content': 0.0019477332243695855, 'timestamp': '2025-09-15 03:21:14.420774', 'step': 2206, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.451662', 'step': 2206, 'epoch': 3} {'type': 'loss', 'content': 0.0016808919608592987, 'timestamp': '2025-09-15 03:21:14.453842', 'step': 2207, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.484567', 'step': 2207, 'epoch': 3} {'type': 'loss', 'content': 0.010064328089356422, 'timestamp': '2025-09-15 03:21:14.508105', 'step': 2208, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.539688', 'step': 2208, 'epoch': 3} {'type': 'loss', 'content': 0.0018260888755321503, 'timestamp': '2025-09-15 03:21:14.542077', 'step': 2209, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.572446', 'step': 2209, 'epoch': 3} {'type': 'loss', 'content': 0.002042062347754836, 'timestamp': '2025-09-15 03:21:14.574751', 'step': 2210, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.605689', 'step': 2210, 'epoch': 3} {'type': 'loss', 'content': 0.000529797631315887, 'timestamp': '2025-09-15 03:21:14.607863', 'step': 2211, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.638139', 'step': 2211, 'epoch': 3} {'type': 'loss', 'content': 0.001175111741758883, 'timestamp': '2025-09-15 03:21:14.661647', 'step': 2212, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.691892', 'step': 2212, 'epoch': 3} {'type': 'loss', 'content': 0.00947808288037777, 'timestamp': '2025-09-15 03:21:14.694066', 'step': 2213, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.724412', 'step': 2213, 'epoch': 3} {'type': 'loss', 'content': 0.0023165601305663586, 'timestamp': '2025-09-15 03:21:14.726573', 'step': 2214, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.757033', 'step': 2214, 'epoch': 3} {'type': 'loss', 'content': 0.0019492261344566941, 'timestamp': '2025-09-15 03:21:14.758985', 'step': 2215, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:14.790110', 'step': 2215, 'epoch': 3} {'type': 'loss', 'content': 0.012912404723465443, 'timestamp': '2025-09-15 03:21:14.813541', 'step': 2216, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.845333', 'step': 2216, 'epoch': 3} {'type': 'loss', 'content': 0.008791116066277027, 'timestamp': '2025-09-15 03:21:14.847413', 'step': 2217, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.878065', 'step': 2217, 'epoch': 3} {'type': 'loss', 'content': 0.02137189917266369, 'timestamp': '2025-09-15 03:21:14.880182', 'step': 2218, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:14.911002', 'step': 2218, 'epoch': 3} {'type': 'loss', 'content': 0.0007815445424057543, 'timestamp': '2025-09-15 03:21:14.913266', 'step': 2219, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.945319', 'step': 2219, 'epoch': 3} {'type': 'loss', 'content': 0.0015050854999572039, 'timestamp': '2025-09-15 03:21:14.968862', 'step': 2220, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:14.999022', 'step': 2220, 'epoch': 3} {'type': 'loss', 'content': 0.00025126116815954447, 'timestamp': '2025-09-15 03:21:15.001187', 'step': 2221, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:15.031485', 'step': 2221, 'epoch': 3} {'type': 'loss', 'content': 0.0011576504912227392, 'timestamp': '2025-09-15 03:21:15.033513', 'step': 2222, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:15.065030', 'step': 2222, 'epoch': 3} {'type': 'loss', 'content': 0.00018423503206577152, 'timestamp': '2025-09-15 03:21:15.069373', 'step': 2223, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:15.807235', 'step': 2223, 'epoch': 3} {'type': 'pplx', 'content': 62849603.449168004, 'timestamp': '2025-09-15 03:21:15.809112', 'step': 2223, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:15.837733', 'step': 2223, 'epoch': 3} {'type': 'loss', 'content': 0.002890202449634671, 'timestamp': '2025-09-15 03:21:15.861190', 'step': 2224, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:15.891662', 'step': 2224, 'epoch': 3} {'type': 'loss', 'content': 0.001774442265741527, 'timestamp': '2025-09-15 03:21:15.893634', 'step': 2225, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:15.923668', 'step': 2225, 'epoch': 3} {'type': 'loss', 'content': 0.0007153319893404841, 'timestamp': '2025-09-15 03:21:15.925724', 'step': 2226, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:15.956847', 'step': 2226, 'epoch': 3} {'type': 'loss', 'content': 0.01221728976815939, 'timestamp': '2025-09-15 03:21:15.958817', 'step': 2227, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:15.989652', 'step': 2227, 'epoch': 3} {'type': 'loss', 'content': 0.01373725850135088, 'timestamp': '2025-09-15 03:21:16.013004', 'step': 2228, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.043676', 'step': 2228, 'epoch': 3} {'type': 'loss', 'content': 0.00021498788555618376, 'timestamp': '2025-09-15 03:21:16.045661', 'step': 2229, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.075571', 'step': 2229, 'epoch': 3} {'type': 'loss', 'content': 0.0012636370956897736, 'timestamp': '2025-09-15 03:21:16.077641', 'step': 2230, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.107738', 'step': 2230, 'epoch': 3} {'type': 'loss', 'content': 9.673281601862982e-05, 'timestamp': '2025-09-15 03:21:16.109877', 'step': 2231, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:16.140124', 'step': 2231, 'epoch': 3} {'type': 'loss', 'content': 0.012603862211108208, 'timestamp': '2025-09-15 03:21:16.163586', 'step': 2232, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.194597', 'step': 2232, 'epoch': 3} {'type': 'loss', 'content': 0.0013240514090284705, 'timestamp': '2025-09-15 03:21:16.196850', 'step': 2233, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.227367', 'step': 2233, 'epoch': 3} {'type': 'loss', 'content': 0.0012036258121952415, 'timestamp': '2025-09-15 03:21:16.229378', 'step': 2234, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.260090', 'step': 2234, 'epoch': 3} {'type': 'loss', 'content': 0.0036719846539199352, 'timestamp': '2025-09-15 03:21:16.262339', 'step': 2235, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:16.292842', 'step': 2235, 'epoch': 3} {'type': 'loss', 'content': 0.0012049399083480239, 'timestamp': '2025-09-15 03:21:16.316527', 'step': 2236, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:16.346845', 'step': 2236, 'epoch': 3} {'type': 'loss', 'content': 0.0011176351690664887, 'timestamp': '2025-09-15 03:21:16.348838', 'step': 2237, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.384654', 'step': 2237, 'epoch': 3} {'type': 'loss', 'content': 0.006942094769328833, 'timestamp': '2025-09-15 03:21:16.386594', 'step': 2238, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:16.417748', 'step': 2238, 'epoch': 3} {'type': 'loss', 'content': 0.019507378339767456, 'timestamp': '2025-09-15 03:21:16.419628', 'step': 2239, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:16.450329', 'step': 2239, 'epoch': 3} {'type': 'loss', 'content': 0.0027631439734250307, 'timestamp': '2025-09-15 03:21:16.473991', 'step': 2240, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.504227', 'step': 2240, 'epoch': 3} {'type': 'loss', 'content': 0.0008822910604067147, 'timestamp': '2025-09-15 03:21:16.506257', 'step': 2241, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:16.537441', 'step': 2241, 'epoch': 3} {'type': 'loss', 'content': 0.00015964095655363053, 'timestamp': '2025-09-15 03:21:16.539530', 'step': 2242, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.570264', 'step': 2242, 'epoch': 3} {'type': 'loss', 'content': 0.0007311701192520559, 'timestamp': '2025-09-15 03:21:16.572110', 'step': 2243, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.605715', 'step': 2243, 'epoch': 3} {'type': 'loss', 'content': 0.002480999333783984, 'timestamp': '2025-09-15 03:21:16.629277', 'step': 2244, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:16.659788', 'step': 2244, 'epoch': 3} {'type': 'loss', 'content': 0.0024871390778571367, 'timestamp': '2025-09-15 03:21:16.661685', 'step': 2245, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:16.691599', 'step': 2245, 'epoch': 3} {'type': 'loss', 'content': 0.006760665215551853, 'timestamp': '2025-09-15 03:21:16.693867', 'step': 2246, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.724268', 'step': 2246, 'epoch': 3} {'type': 'loss', 'content': 0.0028482077177613974, 'timestamp': '2025-09-15 03:21:16.726423', 'step': 2247, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.757079', 'step': 2247, 'epoch': 3} {'type': 'loss', 'content': 0.004411118105053902, 'timestamp': '2025-09-15 03:21:16.780465', 'step': 2248, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.811929', 'step': 2248, 'epoch': 3} {'type': 'loss', 'content': 0.004571598023176193, 'timestamp': '2025-09-15 03:21:16.819061', 'step': 2249, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.857899', 'step': 2249, 'epoch': 3} {'type': 'loss', 'content': 0.00016060953203123063, 'timestamp': '2025-09-15 03:21:16.860019', 'step': 2250, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.891229', 'step': 2250, 'epoch': 3} {'type': 'loss', 'content': 0.0028591505251824856, 'timestamp': '2025-09-15 03:21:16.893416', 'step': 2251, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.924543', 'step': 2251, 'epoch': 3} {'type': 'loss', 'content': 0.004463012330234051, 'timestamp': '2025-09-15 03:21:16.947868', 'step': 2252, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:16.978508', 'step': 2252, 'epoch': 3} {'type': 'loss', 'content': 0.001512286951765418, 'timestamp': '2025-09-15 03:21:16.980651', 'step': 2253, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:17.012210', 'step': 2253, 'epoch': 3} {'type': 'loss', 'content': 0.000547349511180073, 'timestamp': '2025-09-15 03:21:17.013936', 'step': 2254, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.045543', 'step': 2254, 'epoch': 3} {'type': 'loss', 'content': 0.0014820124488323927, 'timestamp': '2025-09-15 03:21:17.047769', 'step': 2255, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:17.077997', 'step': 2255, 'epoch': 3} {'type': 'loss', 'content': 8.133659866871312e-05, 'timestamp': '2025-09-15 03:21:17.101492', 'step': 2256, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:17.132476', 'step': 2256, 'epoch': 3} {'type': 'loss', 'content': 8.59004576341249e-05, 'timestamp': '2025-09-15 03:21:17.134733', 'step': 2257, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:17.167674', 'step': 2257, 'epoch': 3} {'type': 'loss', 'content': 0.0020845939870923758, 'timestamp': '2025-09-15 03:21:17.169736', 'step': 2258, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:17.200606', 'step': 2258, 'epoch': 3} {'type': 'loss', 'content': 0.0009439904824830592, 'timestamp': '2025-09-15 03:21:17.203663', 'step': 2259, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:17.233793', 'step': 2259, 'epoch': 3} {'type': 'loss', 'content': 0.004552639089524746, 'timestamp': '2025-09-15 03:21:17.259120', 'step': 2260, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.294296', 'step': 2260, 'epoch': 3} {'type': 'loss', 'content': 0.002391277113929391, 'timestamp': '2025-09-15 03:21:17.299354', 'step': 2261, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.330817', 'step': 2261, 'epoch': 3} {'type': 'loss', 'content': 0.0007121101371012628, 'timestamp': '2025-09-15 03:21:17.333362', 'step': 2262, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.365177', 'step': 2262, 'epoch': 3} {'type': 'loss', 'content': 0.004409910179674625, 'timestamp': '2025-09-15 03:21:17.367233', 'step': 2263, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:17.398381', 'step': 2263, 'epoch': 3} {'type': 'loss', 'content': 0.001686051837168634, 'timestamp': '2025-09-15 03:21:17.421980', 'step': 2264, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.452806', 'step': 2264, 'epoch': 3} {'type': 'loss', 'content': 7.227421883726493e-05, 'timestamp': '2025-09-15 03:21:17.455045', 'step': 2265, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:17.485671', 'step': 2265, 'epoch': 3} {'type': 'loss', 'content': 0.0006697883945889771, 'timestamp': '2025-09-15 03:21:17.487839', 'step': 2266, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.518474', 'step': 2266, 'epoch': 3} {'type': 'loss', 'content': 0.002797195687890053, 'timestamp': '2025-09-15 03:21:17.520562', 'step': 2267, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:17.550575', 'step': 2267, 'epoch': 3} {'type': 'loss', 'content': 0.0032074761111289263, 'timestamp': '2025-09-15 03:21:17.573993', 'step': 2268, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.604180', 'step': 2268, 'epoch': 3} {'type': 'loss', 'content': 0.00026407671975903213, 'timestamp': '2025-09-15 03:21:17.606205', 'step': 2269, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.636422', 'step': 2269, 'epoch': 3} {'type': 'loss', 'content': 0.0003585081431083381, 'timestamp': '2025-09-15 03:21:17.639759', 'step': 2270, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.670297', 'step': 2270, 'epoch': 3} {'type': 'loss', 'content': 0.000742782314773649, 'timestamp': '2025-09-15 03:21:17.672400', 'step': 2271, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.703628', 'step': 2271, 'epoch': 3} {'type': 'loss', 'content': 0.0006893987883813679, 'timestamp': '2025-09-15 03:21:17.727178', 'step': 2272, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.757568', 'step': 2272, 'epoch': 3} {'type': 'loss', 'content': 0.0043635121546685696, 'timestamp': '2025-09-15 03:21:17.759543', 'step': 2273, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.789442', 'step': 2273, 'epoch': 3} {'type': 'loss', 'content': 0.002095921663567424, 'timestamp': '2025-09-15 03:21:17.791184', 'step': 2274, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.821480', 'step': 2274, 'epoch': 3} {'type': 'loss', 'content': 0.0013823070330545306, 'timestamp': '2025-09-15 03:21:17.823469', 'step': 2275, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.853314', 'step': 2275, 'epoch': 3} {'type': 'loss', 'content': 0.006985916756093502, 'timestamp': '2025-09-15 03:21:17.876854', 'step': 2276, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:17.907485', 'step': 2276, 'epoch': 3} {'type': 'loss', 'content': 0.00044175630318932235, 'timestamp': '2025-09-15 03:21:17.909554', 'step': 2277, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:17.939414', 'step': 2277, 'epoch': 3} {'type': 'loss', 'content': 0.03420780971646309, 'timestamp': '2025-09-15 03:21:17.941547', 'step': 2278, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:17.973125', 'step': 2278, 'epoch': 3} {'type': 'loss', 'content': 0.00027540497831068933, 'timestamp': '2025-09-15 03:21:17.975592', 'step': 2279, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:18.007275', 'step': 2279, 'epoch': 3} {'type': 'loss', 'content': 0.00519139226526022, 'timestamp': '2025-09-15 03:21:18.030680', 'step': 2280, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:18.770531', 'step': 2280, 'epoch': 3} {'type': 'pplx', 'content': 67494214.18423404, 'timestamp': '2025-09-15 03:21:18.772658', 'step': 2280, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:18.800657', 'step': 2280, 'epoch': 3} {'type': 'loss', 'content': 0.00019070318376179785, 'timestamp': '2025-09-15 03:21:18.802774', 'step': 2281, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:18.832388', 'step': 2281, 'epoch': 3} {'type': 'loss', 'content': 0.00020345590019132942, 'timestamp': '2025-09-15 03:21:18.834650', 'step': 2282, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:18.864868', 'step': 2282, 'epoch': 3} {'type': 'loss', 'content': 8.062987762968987e-05, 'timestamp': '2025-09-15 03:21:18.867230', 'step': 2283, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:18.897898', 'step': 2283, 'epoch': 3} {'type': 'loss', 'content': 0.0009913406101986766, 'timestamp': '2025-09-15 03:21:18.921471', 'step': 2284, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:18.951838', 'step': 2284, 'epoch': 3} {'type': 'loss', 'content': 0.0008175976690836251, 'timestamp': '2025-09-15 03:21:18.953996', 'step': 2285, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:18.985740', 'step': 2285, 'epoch': 3} {'type': 'loss', 'content': 7.580334931844845e-05, 'timestamp': '2025-09-15 03:21:18.988295', 'step': 2286, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.019438', 'step': 2286, 'epoch': 3} {'type': 'loss', 'content': 0.00028774369275197387, 'timestamp': '2025-09-15 03:21:19.021740', 'step': 2287, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:19.054509', 'step': 2287, 'epoch': 3} {'type': 'loss', 'content': 0.0005485046422109008, 'timestamp': '2025-09-15 03:21:19.080024', 'step': 2288, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:19.110975', 'step': 2288, 'epoch': 3} {'type': 'loss', 'content': 0.00015744587290100753, 'timestamp': '2025-09-15 03:21:19.113163', 'step': 2289, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.144343', 'step': 2289, 'epoch': 3} {'type': 'loss', 'content': 0.0001991561584873125, 'timestamp': '2025-09-15 03:21:19.146647', 'step': 2290, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.177367', 'step': 2290, 'epoch': 3} {'type': 'loss', 'content': 0.008997390046715736, 'timestamp': '2025-09-15 03:21:19.179781', 'step': 2291, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:19.210684', 'step': 2291, 'epoch': 3} {'type': 'loss', 'content': 0.002861560555174947, 'timestamp': '2025-09-15 03:21:19.234419', 'step': 2292, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.264635', 'step': 2292, 'epoch': 3} {'type': 'loss', 'content': 0.0005258307792246342, 'timestamp': '2025-09-15 03:21:19.266801', 'step': 2293, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:19.296948', 'step': 2293, 'epoch': 3} {'type': 'loss', 'content': 0.0035565574653446674, 'timestamp': '2025-09-15 03:21:19.299181', 'step': 2294, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.329233', 'step': 2294, 'epoch': 3} {'type': 'loss', 'content': 0.001044388976879418, 'timestamp': '2025-09-15 03:21:19.331464', 'step': 2295, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.361435', 'step': 2295, 'epoch': 3} {'type': 'loss', 'content': 0.0002443413541186601, 'timestamp': '2025-09-15 03:21:19.385263', 'step': 2296, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.415785', 'step': 2296, 'epoch': 3} {'type': 'loss', 'content': 0.0006027042982168496, 'timestamp': '2025-09-15 03:21:19.417901', 'step': 2297, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.448106', 'step': 2297, 'epoch': 3} {'type': 'loss', 'content': 0.0026935446076095104, 'timestamp': '2025-09-15 03:21:19.450301', 'step': 2298, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.480885', 'step': 2298, 'epoch': 3} {'type': 'loss', 'content': 0.00013987746206112206, 'timestamp': '2025-09-15 03:21:19.483185', 'step': 2299, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.512985', 'step': 2299, 'epoch': 3} {'type': 'loss', 'content': 0.0003008446656167507, 'timestamp': '2025-09-15 03:21:19.537256', 'step': 2300, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.567766', 'step': 2300, 'epoch': 3} {'type': 'loss', 'content': 0.0002455053327139467, 'timestamp': '2025-09-15 03:21:19.569739', 'step': 2301, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.599903', 'step': 2301, 'epoch': 3} {'type': 'loss', 'content': 7.855286821722984e-05, 'timestamp': '2025-09-15 03:21:19.601961', 'step': 2302, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.633244', 'step': 2302, 'epoch': 3} {'type': 'loss', 'content': 0.018144680187106133, 'timestamp': '2025-09-15 03:21:19.635340', 'step': 2303, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.665144', 'step': 2303, 'epoch': 3} {'type': 'loss', 'content': 0.0017443523975089192, 'timestamp': '2025-09-15 03:21:19.688834', 'step': 2304, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.719007', 'step': 2304, 'epoch': 3} {'type': 'loss', 'content': 0.00037380401045084, 'timestamp': '2025-09-15 03:21:19.721921', 'step': 2305, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.751988', 'step': 2305, 'epoch': 3} {'type': 'loss', 'content': 0.0004997936775907874, 'timestamp': '2025-09-15 03:21:19.754299', 'step': 2306, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:19.784702', 'step': 2306, 'epoch': 3} {'type': 'loss', 'content': 0.014436488971114159, 'timestamp': '2025-09-15 03:21:19.786890', 'step': 2307, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:19.817394', 'step': 2307, 'epoch': 3} {'type': 'loss', 'content': 0.00046557295718230307, 'timestamp': '2025-09-15 03:21:19.841063', 'step': 2308, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:19.871549', 'step': 2308, 'epoch': 3} {'type': 'loss', 'content': 0.00021677868789993227, 'timestamp': '2025-09-15 03:21:19.873851', 'step': 2309, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.904416', 'step': 2309, 'epoch': 3} {'type': 'loss', 'content': 0.0003104743082076311, 'timestamp': '2025-09-15 03:21:19.906634', 'step': 2310, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.936762', 'step': 2310, 'epoch': 3} {'type': 'loss', 'content': 0.0001938850909937173, 'timestamp': '2025-09-15 03:21:19.938781', 'step': 2311, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:19.969518', 'step': 2311, 'epoch': 3} {'type': 'loss', 'content': 0.0005853885086253285, 'timestamp': '2025-09-15 03:21:19.992993', 'step': 2312, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.023242', 'step': 2312, 'epoch': 3} {'type': 'loss', 'content': 0.00036151515087112784, 'timestamp': '2025-09-15 03:21:20.025591', 'step': 2313, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.055885', 'step': 2313, 'epoch': 3} {'type': 'loss', 'content': 8.350707503268495e-05, 'timestamp': '2025-09-15 03:21:20.058007', 'step': 2314, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.087636', 'step': 2314, 'epoch': 3} {'type': 'loss', 'content': 0.0010728834895417094, 'timestamp': '2025-09-15 03:21:20.089729', 'step': 2315, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.119777', 'step': 2315, 'epoch': 3} {'type': 'loss', 'content': 0.00021373596973717213, 'timestamp': '2025-09-15 03:21:20.143249', 'step': 2316, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:20.173933', 'step': 2316, 'epoch': 3} {'type': 'loss', 'content': 8.931905904319137e-05, 'timestamp': '2025-09-15 03:21:20.176033', 'step': 2317, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.208500', 'step': 2317, 'epoch': 3} {'type': 'loss', 'content': 0.0002284930378664285, 'timestamp': '2025-09-15 03:21:20.210577', 'step': 2318, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:20.242319', 'step': 2318, 'epoch': 3} {'type': 'loss', 'content': 0.00015003184671513736, 'timestamp': '2025-09-15 03:21:20.244631', 'step': 2319, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.276639', 'step': 2319, 'epoch': 3} {'type': 'loss', 'content': 0.0008775495225563645, 'timestamp': '2025-09-15 03:21:20.300191', 'step': 2320, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:20.331757', 'step': 2320, 'epoch': 3} {'type': 'loss', 'content': 0.0004940013168379664, 'timestamp': '2025-09-15 03:21:20.333891', 'step': 2321, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.364748', 'step': 2321, 'epoch': 3} {'type': 'loss', 'content': 0.00012955373676959425, 'timestamp': '2025-09-15 03:21:20.366796', 'step': 2322, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.397555', 'step': 2322, 'epoch': 3} {'type': 'loss', 'content': 0.00020479969680309296, 'timestamp': '2025-09-15 03:21:20.399821', 'step': 2323, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:20.430759', 'step': 2323, 'epoch': 3} {'type': 'loss', 'content': 0.00027445584419183433, 'timestamp': '2025-09-15 03:21:20.454353', 'step': 2324, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.485387', 'step': 2324, 'epoch': 3} {'type': 'loss', 'content': 0.00012975472782272846, 'timestamp': '2025-09-15 03:21:20.487673', 'step': 2325, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:20.519308', 'step': 2325, 'epoch': 3} {'type': 'loss', 'content': 0.0025794480461627245, 'timestamp': '2025-09-15 03:21:20.521621', 'step': 2326, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.551687', 'step': 2326, 'epoch': 3} {'type': 'loss', 'content': 0.005738633684813976, 'timestamp': '2025-09-15 03:21:20.553942', 'step': 2327, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.585340', 'step': 2327, 'epoch': 3} {'type': 'loss', 'content': 0.0008769879932515323, 'timestamp': '2025-09-15 03:21:20.608962', 'step': 2328, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.639383', 'step': 2328, 'epoch': 3} {'type': 'loss', 'content': 0.0002040156105067581, 'timestamp': '2025-09-15 03:21:20.641512', 'step': 2329, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.671473', 'step': 2329, 'epoch': 3} {'type': 'loss', 'content': 0.00022350263316184282, 'timestamp': '2025-09-15 03:21:20.673708', 'step': 2330, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:20.704482', 'step': 2330, 'epoch': 3} {'type': 'loss', 'content': 0.00019051216077059507, 'timestamp': '2025-09-15 03:21:20.706634', 'step': 2331, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:20.738146', 'step': 2331, 'epoch': 3} {'type': 'loss', 'content': 0.001020329655148089, 'timestamp': '2025-09-15 03:21:20.761649', 'step': 2332, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.792571', 'step': 2332, 'epoch': 3} {'type': 'loss', 'content': 0.00016312638763338327, 'timestamp': '2025-09-15 03:21:20.794977', 'step': 2333, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.825203', 'step': 2333, 'epoch': 3} {'type': 'loss', 'content': 0.00018660161003936082, 'timestamp': '2025-09-15 03:21:20.827348', 'step': 2334, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.857435', 'step': 2334, 'epoch': 3} {'type': 'loss', 'content': 0.013526364229619503, 'timestamp': '2025-09-15 03:21:20.859717', 'step': 2335, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:20.890701', 'step': 2335, 'epoch': 3} {'type': 'loss', 'content': 0.0001784945634426549, 'timestamp': '2025-09-15 03:21:20.914265', 'step': 2336, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:20.944920', 'step': 2336, 'epoch': 3} {'type': 'loss', 'content': 0.003306002588942647, 'timestamp': '2025-09-15 03:21:20.946961', 'step': 2337, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:21.679516', 'step': 2337, 'epoch': 3} {'type': 'pplx', 'content': 60885470.31679232, 'timestamp': '2025-09-15 03:21:21.681134', 'step': 2337, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:21.709786', 'step': 2337, 'epoch': 3} {'type': 'loss', 'content': 0.00014974501391407102, 'timestamp': '2025-09-15 03:21:21.712019', 'step': 2338, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:21.741925', 'step': 2338, 'epoch': 3} {'type': 'loss', 'content': 0.00010141759412363172, 'timestamp': '2025-09-15 03:21:21.743945', 'step': 2339, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:21.774134', 'step': 2339, 'epoch': 3} {'type': 'loss', 'content': 0.0013717318652197719, 'timestamp': '2025-09-15 03:21:21.797646', 'step': 2340, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:21.828752', 'step': 2340, 'epoch': 3} {'type': 'loss', 'content': 0.0010869607795029879, 'timestamp': '2025-09-15 03:21:21.831018', 'step': 2341, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:21.860961', 'step': 2341, 'epoch': 3} {'type': 'loss', 'content': 0.001997420797124505, 'timestamp': '2025-09-15 03:21:21.863369', 'step': 2342, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:21.894261', 'step': 2342, 'epoch': 3} {'type': 'loss', 'content': 0.00011396995250834152, 'timestamp': '2025-09-15 03:21:21.896439', 'step': 2343, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:21.926555', 'step': 2343, 'epoch': 3} {'type': 'loss', 'content': 0.0003269371227361262, 'timestamp': '2025-09-15 03:21:21.949966', 'step': 2344, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:21.980383', 'step': 2344, 'epoch': 3} {'type': 'loss', 'content': 0.010744208469986916, 'timestamp': '2025-09-15 03:21:21.982465', 'step': 2345, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.012953', 'step': 2345, 'epoch': 3} {'type': 'loss', 'content': 0.00023648412025067955, 'timestamp': '2025-09-15 03:21:22.015623', 'step': 2346, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.045606', 'step': 2346, 'epoch': 3} {'type': 'loss', 'content': 0.00028562903753481805, 'timestamp': '2025-09-15 03:21:22.047791', 'step': 2347, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.078104', 'step': 2347, 'epoch': 3} {'type': 'loss', 'content': 0.00023739961034152657, 'timestamp': '2025-09-15 03:21:22.101647', 'step': 2348, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.131433', 'step': 2348, 'epoch': 3} {'type': 'loss', 'content': 0.00021465822646860033, 'timestamp': '2025-09-15 03:21:22.133327', 'step': 2349, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.163473', 'step': 2349, 'epoch': 3} {'type': 'loss', 'content': 0.002316099824383855, 'timestamp': '2025-09-15 03:21:22.165609', 'step': 2350, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:22.195887', 'step': 2350, 'epoch': 3} {'type': 'loss', 'content': 0.0008549098274670541, 'timestamp': '2025-09-15 03:21:22.198549', 'step': 2351, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.228745', 'step': 2351, 'epoch': 3} {'type': 'loss', 'content': 0.0005782007938250899, 'timestamp': '2025-09-15 03:21:22.252244', 'step': 2352, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.282795', 'step': 2352, 'epoch': 3} {'type': 'loss', 'content': 0.00010357372957514599, 'timestamp': '2025-09-15 03:21:22.284879', 'step': 2353, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.316229', 'step': 2353, 'epoch': 3} {'type': 'loss', 'content': 0.0057270945981144905, 'timestamp': '2025-09-15 03:21:22.318317', 'step': 2354, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.348685', 'step': 2354, 'epoch': 3} {'type': 'loss', 'content': 0.0004270931822247803, 'timestamp': '2025-09-15 03:21:22.350847', 'step': 2355, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.381214', 'step': 2355, 'epoch': 3} {'type': 'loss', 'content': 0.00011959804396610707, 'timestamp': '2025-09-15 03:21:22.404826', 'step': 2356, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.435320', 'step': 2356, 'epoch': 3} {'type': 'loss', 'content': 0.0002549807832110673, 'timestamp': '2025-09-15 03:21:22.437394', 'step': 2357, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.468229', 'step': 2357, 'epoch': 3} {'type': 'loss', 'content': 0.0006122398190200329, 'timestamp': '2025-09-15 03:21:22.470293', 'step': 2358, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.500178', 'step': 2358, 'epoch': 3} {'type': 'loss', 'content': 0.0005783793749287724, 'timestamp': '2025-09-15 03:21:22.502304', 'step': 2359, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.532713', 'step': 2359, 'epoch': 3} {'type': 'loss', 'content': 0.005947014782577753, 'timestamp': '2025-09-15 03:21:22.556353', 'step': 2360, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.586717', 'step': 2360, 'epoch': 3} {'type': 'loss', 'content': 0.00028083566576242447, 'timestamp': '2025-09-15 03:21:22.588954', 'step': 2361, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.618813', 'step': 2361, 'epoch': 3} {'type': 'loss', 'content': 0.00033108098432421684, 'timestamp': '2025-09-15 03:21:22.620940', 'step': 2362, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.650854', 'step': 2362, 'epoch': 3} {'type': 'loss', 'content': 0.0001367040240438655, 'timestamp': '2025-09-15 03:21:22.652864', 'step': 2363, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.682727', 'step': 2363, 'epoch': 3} {'type': 'loss', 'content': 0.00224878778681159, 'timestamp': '2025-09-15 03:21:22.705846', 'step': 2364, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.736412', 'step': 2364, 'epoch': 3} {'type': 'loss', 'content': 0.0010682273423299193, 'timestamp': '2025-09-15 03:21:22.738755', 'step': 2365, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.769505', 'step': 2365, 'epoch': 3} {'type': 'loss', 'content': 0.005659495014697313, 'timestamp': '2025-09-15 03:21:22.771479', 'step': 2366, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.801326', 'step': 2366, 'epoch': 3} {'type': 'loss', 'content': 0.006955933757126331, 'timestamp': '2025-09-15 03:21:22.803365', 'step': 2367, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.834409', 'step': 2367, 'epoch': 3} {'type': 'loss', 'content': 0.0006349317845888436, 'timestamp': '2025-09-15 03:21:22.857825', 'step': 2368, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.887859', 'step': 2368, 'epoch': 3} {'type': 'loss', 'content': 0.0021447453182190657, 'timestamp': '2025-09-15 03:21:22.889863', 'step': 2369, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.919385', 'step': 2369, 'epoch': 3} {'type': 'loss', 'content': 0.0001313880638917908, 'timestamp': '2025-09-15 03:21:22.921417', 'step': 2370, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:22.951515', 'step': 2370, 'epoch': 3} {'type': 'loss', 'content': 0.00024139614833984524, 'timestamp': '2025-09-15 03:21:22.953681', 'step': 2371, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:22.984772', 'step': 2371, 'epoch': 3} {'type': 'loss', 'content': 0.00474166963249445, 'timestamp': '2025-09-15 03:21:23.008291', 'step': 2372, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.038395', 'step': 2372, 'epoch': 3} {'type': 'loss', 'content': 0.0003615278110373765, 'timestamp': '2025-09-15 03:21:23.040435', 'step': 2373, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.071395', 'step': 2373, 'epoch': 3} {'type': 'loss', 'content': 0.0063685099594295025, 'timestamp': '2025-09-15 03:21:23.073744', 'step': 2374, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.104428', 'step': 2374, 'epoch': 3} {'type': 'loss', 'content': 0.0005614294786937535, 'timestamp': '2025-09-15 03:21:23.106666', 'step': 2375, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.138159', 'step': 2375, 'epoch': 3} {'type': 'loss', 'content': 0.04361369088292122, 'timestamp': '2025-09-15 03:21:23.161706', 'step': 2376, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.191616', 'step': 2376, 'epoch': 3} {'type': 'loss', 'content': 0.00016917834000196308, 'timestamp': '2025-09-15 03:21:23.193382', 'step': 2377, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.223222', 'step': 2377, 'epoch': 3} {'type': 'loss', 'content': 0.0740358904004097, 'timestamp': '2025-09-15 03:21:23.225488', 'step': 2378, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.255712', 'step': 2378, 'epoch': 3} {'type': 'loss', 'content': 0.0010343164904043078, 'timestamp': '2025-09-15 03:21:23.257944', 'step': 2379, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.288526', 'step': 2379, 'epoch': 3} {'type': 'loss', 'content': 0.0007979201036505401, 'timestamp': '2025-09-15 03:21:23.311986', 'step': 2380, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.342550', 'step': 2380, 'epoch': 3} {'type': 'loss', 'content': 0.00026409697602503, 'timestamp': '2025-09-15 03:21:23.344552', 'step': 2381, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:23.374381', 'step': 2381, 'epoch': 3} {'type': 'loss', 'content': 0.031576044857501984, 'timestamp': '2025-09-15 03:21:23.376382', 'step': 2382, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.406257', 'step': 2382, 'epoch': 3} {'type': 'loss', 'content': 0.0009410924976691604, 'timestamp': '2025-09-15 03:21:23.408369', 'step': 2383, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.438367', 'step': 2383, 'epoch': 3} {'type': 'loss', 'content': 0.006526883225888014, 'timestamp': '2025-09-15 03:21:23.461644', 'step': 2384, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.493187', 'step': 2384, 'epoch': 3} {'type': 'loss', 'content': 0.0004621174302883446, 'timestamp': '2025-09-15 03:21:23.495243', 'step': 2385, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.525161', 'step': 2385, 'epoch': 3} {'type': 'loss', 'content': 0.00030433182837441564, 'timestamp': '2025-09-15 03:21:23.527347', 'step': 2386, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.558758', 'step': 2386, 'epoch': 3} {'type': 'loss', 'content': 0.001242363709025085, 'timestamp': '2025-09-15 03:21:23.560809', 'step': 2387, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.591637', 'step': 2387, 'epoch': 3} {'type': 'loss', 'content': 0.03715752810239792, 'timestamp': '2025-09-15 03:21:23.615062', 'step': 2388, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:23.646027', 'step': 2388, 'epoch': 3} {'type': 'loss', 'content': 0.05452625826001167, 'timestamp': '2025-09-15 03:21:23.648252', 'step': 2389, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.678368', 'step': 2389, 'epoch': 3} {'type': 'loss', 'content': 0.0011189163196831942, 'timestamp': '2025-09-15 03:21:23.680893', 'step': 2390, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:23.714005', 'step': 2390, 'epoch': 3} {'type': 'loss', 'content': 0.02448815479874611, 'timestamp': '2025-09-15 03:21:23.716919', 'step': 2391, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.747137', 'step': 2391, 'epoch': 3} {'type': 'loss', 'content': 0.020943596959114075, 'timestamp': '2025-09-15 03:21:23.770523', 'step': 2392, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.801773', 'step': 2392, 'epoch': 3} {'type': 'loss', 'content': 0.022623786702752113, 'timestamp': '2025-09-15 03:21:23.803869', 'step': 2393, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:23.833998', 'step': 2393, 'epoch': 3} {'type': 'loss', 'content': 0.00040333118522539735, 'timestamp': '2025-09-15 03:21:23.836414', 'step': 2394, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:24.564591', 'step': 2394, 'epoch': 3} {'type': 'pplx', 'content': 65587889.72230218, 'timestamp': '2025-09-15 03:21:24.566516', 'step': 2394, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.594840', 'step': 2394, 'epoch': 3} {'type': 'loss', 'content': 0.0008957642712630332, 'timestamp': '2025-09-15 03:21:24.596892', 'step': 2395, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.626731', 'step': 2395, 'epoch': 3} {'type': 'loss', 'content': 0.001257409923709929, 'timestamp': '2025-09-15 03:21:24.651088', 'step': 2396, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.681378', 'step': 2396, 'epoch': 3} {'type': 'loss', 'content': 0.014478609897196293, 'timestamp': '2025-09-15 03:21:24.683456', 'step': 2397, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.715105', 'step': 2397, 'epoch': 3} {'type': 'loss', 'content': 0.01863393373787403, 'timestamp': '2025-09-15 03:21:24.717097', 'step': 2398, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:24.747328', 'step': 2398, 'epoch': 3} {'type': 'loss', 'content': 0.03859979286789894, 'timestamp': '2025-09-15 03:21:24.749398', 'step': 2399, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.779655', 'step': 2399, 'epoch': 3} {'type': 'loss', 'content': 0.035878926515579224, 'timestamp': '2025-09-15 03:21:24.803414', 'step': 2400, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.833484', 'step': 2400, 'epoch': 3} {'type': 'loss', 'content': 0.002484408440068364, 'timestamp': '2025-09-15 03:21:24.835694', 'step': 2401, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:24.866337', 'step': 2401, 'epoch': 3} {'type': 'loss', 'content': 0.03723729029297829, 'timestamp': '2025-09-15 03:21:24.869018', 'step': 2402, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.899566', 'step': 2402, 'epoch': 3} {'type': 'loss', 'content': 0.007320962380617857, 'timestamp': '2025-09-15 03:21:24.901597', 'step': 2403, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.931335', 'step': 2403, 'epoch': 3} {'type': 'loss', 'content': 0.009379333816468716, 'timestamp': '2025-09-15 03:21:24.955085', 'step': 2404, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:24.985436', 'step': 2404, 'epoch': 3} {'type': 'loss', 'content': 0.002660617232322693, 'timestamp': '2025-09-15 03:21:24.987405', 'step': 2405, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.018379', 'step': 2405, 'epoch': 3} {'type': 'loss', 'content': 0.0032046171836555004, 'timestamp': '2025-09-15 03:21:25.020380', 'step': 2406, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:25.050837', 'step': 2406, 'epoch': 3} {'type': 'loss', 'content': 0.01012934185564518, 'timestamp': '2025-09-15 03:21:25.052941', 'step': 2407, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.083056', 'step': 2407, 'epoch': 3} {'type': 'loss', 'content': 0.010252327658236027, 'timestamp': '2025-09-15 03:21:25.106652', 'step': 2408, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.137235', 'step': 2408, 'epoch': 3} {'type': 'loss', 'content': 0.006697942037135363, 'timestamp': '2025-09-15 03:21:25.139231', 'step': 2409, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.169674', 'step': 2409, 'epoch': 3} {'type': 'loss', 'content': 0.020396556705236435, 'timestamp': '2025-09-15 03:21:25.172033', 'step': 2410, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:25.202155', 'step': 2410, 'epoch': 3} {'type': 'loss', 'content': 0.010722288861870766, 'timestamp': '2025-09-15 03:21:25.204307', 'step': 2411, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:25.234498', 'step': 2411, 'epoch': 3} {'type': 'loss', 'content': 0.022731030359864235, 'timestamp': '2025-09-15 03:21:25.258449', 'step': 2412, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.288794', 'step': 2412, 'epoch': 3} {'type': 'loss', 'content': 0.004771314561367035, 'timestamp': '2025-09-15 03:21:25.290864', 'step': 2413, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.320680', 'step': 2413, 'epoch': 3} {'type': 'loss', 'content': 0.00403338298201561, 'timestamp': '2025-09-15 03:21:25.322721', 'step': 2414, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:25.353003', 'step': 2414, 'epoch': 3} {'type': 'loss', 'content': 0.005938275717198849, 'timestamp': '2025-09-15 03:21:25.355364', 'step': 2415, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.385657', 'step': 2415, 'epoch': 3} {'type': 'loss', 'content': 0.0006778505048714578, 'timestamp': '2025-09-15 03:21:25.409300', 'step': 2416, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:25.439782', 'step': 2416, 'epoch': 3} {'type': 'loss', 'content': 0.007004484534263611, 'timestamp': '2025-09-15 03:21:25.442040', 'step': 2417, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:25.472455', 'step': 2417, 'epoch': 3} {'type': 'loss', 'content': 0.008051480166614056, 'timestamp': '2025-09-15 03:21:25.474815', 'step': 2418, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.505229', 'step': 2418, 'epoch': 3} {'type': 'loss', 'content': 0.015677032992243767, 'timestamp': '2025-09-15 03:21:25.507344', 'step': 2419, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.538006', 'step': 2419, 'epoch': 3} {'type': 'loss', 'content': 0.0012792375637218356, 'timestamp': '2025-09-15 03:21:25.562179', 'step': 2420, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.592334', 'step': 2420, 'epoch': 3} {'type': 'loss', 'content': 0.000992942019365728, 'timestamp': '2025-09-15 03:21:25.594510', 'step': 2421, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:25.625590', 'step': 2421, 'epoch': 3} {'type': 'loss', 'content': 0.04845261946320534, 'timestamp': '2025-09-15 03:21:25.627798', 'step': 2422, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.657733', 'step': 2422, 'epoch': 3} {'type': 'loss', 'content': 0.031112248077988625, 'timestamp': '2025-09-15 03:21:25.659871', 'step': 2423, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.690334', 'step': 2423, 'epoch': 3} {'type': 'loss', 'content': 0.011084296740591526, 'timestamp': '2025-09-15 03:21:25.713847', 'step': 2424, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 176], 'flops': 5220903722048}, 'timestamp': '2025-09-15 03:21:25.744747', 'step': 2424, 'epoch': 3} {'type': 'loss', 'content': 0.019527485594153404, 'timestamp': '2025-09-15 03:21:25.746744', 'step': 2425, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.777085', 'step': 2425, 'epoch': 3} {'type': 'loss', 'content': 0.007705447729676962, 'timestamp': '2025-09-15 03:21:25.779132', 'step': 2426, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.808627', 'step': 2426, 'epoch': 3} {'type': 'loss', 'content': 0.01861448585987091, 'timestamp': '2025-09-15 03:21:25.810926', 'step': 2427, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:25.841556', 'step': 2427, 'epoch': 3} {'type': 'loss', 'content': 0.0005472367047332227, 'timestamp': '2025-09-15 03:21:25.866135', 'step': 2428, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.896651', 'step': 2428, 'epoch': 3} {'type': 'loss', 'content': 0.009006543084979057, 'timestamp': '2025-09-15 03:21:25.898930', 'step': 2429, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.929182', 'step': 2429, 'epoch': 3} {'type': 'loss', 'content': 0.018193485215306282, 'timestamp': '2025-09-15 03:21:25.931260', 'step': 2430, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:25.961692', 'step': 2430, 'epoch': 3} {'type': 'loss', 'content': 0.010625666007399559, 'timestamp': '2025-09-15 03:21:25.963980', 'step': 2431, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:25.993978', 'step': 2431, 'epoch': 3} {'type': 'loss', 'content': 0.013900967314839363, 'timestamp': '2025-09-15 03:21:26.017580', 'step': 2432, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:26.048407', 'step': 2432, 'epoch': 3} {'type': 'loss', 'content': 0.011567777954041958, 'timestamp': '2025-09-15 03:21:26.050465', 'step': 2433, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.081193', 'step': 2433, 'epoch': 3} {'type': 'loss', 'content': 0.017127785831689835, 'timestamp': '2025-09-15 03:21:26.083207', 'step': 2434, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.113332', 'step': 2434, 'epoch': 3} {'type': 'loss', 'content': 0.007701248396188021, 'timestamp': '2025-09-15 03:21:26.115405', 'step': 2435, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.145657', 'step': 2435, 'epoch': 3} {'type': 'loss', 'content': 0.012019234709441662, 'timestamp': '2025-09-15 03:21:26.169875', 'step': 2436, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.200148', 'step': 2436, 'epoch': 3} {'type': 'loss', 'content': 0.01517223659902811, 'timestamp': '2025-09-15 03:21:26.202118', 'step': 2437, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.232074', 'step': 2437, 'epoch': 3} {'type': 'loss', 'content': 0.01354182232171297, 'timestamp': '2025-09-15 03:21:26.234436', 'step': 2438, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.264584', 'step': 2438, 'epoch': 3} {'type': 'loss', 'content': 0.008238118141889572, 'timestamp': '2025-09-15 03:21:26.266934', 'step': 2439, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:26.297598', 'step': 2439, 'epoch': 3} {'type': 'loss', 'content': 0.009475002065300941, 'timestamp': '2025-09-15 03:21:26.321399', 'step': 2440, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.352095', 'step': 2440, 'epoch': 3} {'type': 'loss', 'content': 0.009594145230948925, 'timestamp': '2025-09-15 03:21:26.354087', 'step': 2441, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:26.384631', 'step': 2441, 'epoch': 3} {'type': 'loss', 'content': 0.0014280823525041342, 'timestamp': '2025-09-15 03:21:26.386828', 'step': 2442, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.417928', 'step': 2442, 'epoch': 3} {'type': 'loss', 'content': 0.0034047921653836966, 'timestamp': '2025-09-15 03:21:26.419964', 'step': 2443, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:26.450878', 'step': 2443, 'epoch': 3} {'type': 'loss', 'content': 0.006948020774871111, 'timestamp': '2025-09-15 03:21:26.474651', 'step': 2444, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.507541', 'step': 2444, 'epoch': 3} {'type': 'loss', 'content': 0.015213142149150372, 'timestamp': '2025-09-15 03:21:26.509806', 'step': 2445, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.541494', 'step': 2445, 'epoch': 3} {'type': 'loss', 'content': 0.02016303315758705, 'timestamp': '2025-09-15 03:21:26.546937', 'step': 2446, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.579146', 'step': 2446, 'epoch': 3} {'type': 'loss', 'content': 0.01299526821821928, 'timestamp': '2025-09-15 03:21:26.581246', 'step': 2447, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.613740', 'step': 2447, 'epoch': 3} {'type': 'loss', 'content': 0.0024096709676086903, 'timestamp': '2025-09-15 03:21:26.640995', 'step': 2448, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.671754', 'step': 2448, 'epoch': 3} {'type': 'loss', 'content': 0.0309018325060606, 'timestamp': '2025-09-15 03:21:26.673885', 'step': 2449, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:26.706085', 'step': 2449, 'epoch': 3} {'type': 'loss', 'content': 0.014632557518780231, 'timestamp': '2025-09-15 03:21:26.708248', 'step': 2450, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:26.739898', 'step': 2450, 'epoch': 3} {'type': 'loss', 'content': 0.01982324756681919, 'timestamp': '2025-09-15 03:21:26.741997', 'step': 2451, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:27.590220', 'step': 2451, 'epoch': 3} {'type': 'pplx', 'content': 68557706.19116534, 'timestamp': '2025-09-15 03:21:27.592181', 'step': 2451, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.621170', 'step': 2451, 'epoch': 3} {'type': 'loss', 'content': 0.009094307199120522, 'timestamp': '2025-09-15 03:21:27.648167', 'step': 2452, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.680986', 'step': 2452, 'epoch': 3} {'type': 'loss', 'content': 0.011998111382126808, 'timestamp': '2025-09-15 03:21:27.683238', 'step': 2453, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.714151', 'step': 2453, 'epoch': 3} {'type': 'loss', 'content': 0.00863865576684475, 'timestamp': '2025-09-15 03:21:27.716394', 'step': 2454, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:27.749598', 'step': 2454, 'epoch': 3} {'type': 'loss', 'content': 0.035453762859106064, 'timestamp': '2025-09-15 03:21:27.751770', 'step': 2455, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:27.782656', 'step': 2455, 'epoch': 3} {'type': 'loss', 'content': 0.0025879372842609882, 'timestamp': '2025-09-15 03:21:27.806266', 'step': 2456, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.837289', 'step': 2456, 'epoch': 3} {'type': 'loss', 'content': 0.0030494672246277332, 'timestamp': '2025-09-15 03:21:27.841544', 'step': 2457, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.877047', 'step': 2457, 'epoch': 3} {'type': 'loss', 'content': 0.009609291329979897, 'timestamp': '2025-09-15 03:21:27.879293', 'step': 2458, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.911685', 'step': 2458, 'epoch': 3} {'type': 'loss', 'content': 0.008864649571478367, 'timestamp': '2025-09-15 03:21:27.913781', 'step': 2459, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:27.945302', 'step': 2459, 'epoch': 3} {'type': 'loss', 'content': 0.0009798811515793204, 'timestamp': '2025-09-15 03:21:27.973847', 'step': 2460, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.004516', 'step': 2460, 'epoch': 3} {'type': 'loss', 'content': 0.004546779673546553, 'timestamp': '2025-09-15 03:21:28.006672', 'step': 2461, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.038157', 'step': 2461, 'epoch': 3} {'type': 'loss', 'content': 0.010944445617496967, 'timestamp': '2025-09-15 03:21:28.040477', 'step': 2462, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.071077', 'step': 2462, 'epoch': 3} {'type': 'loss', 'content': 0.028137998655438423, 'timestamp': '2025-09-15 03:21:28.074083', 'step': 2463, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.109670', 'step': 2463, 'epoch': 3} {'type': 'loss', 'content': 0.04601803421974182, 'timestamp': '2025-09-15 03:21:28.133440', 'step': 2464, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.165002', 'step': 2464, 'epoch': 3} {'type': 'loss', 'content': 0.007286368403583765, 'timestamp': '2025-09-15 03:21:28.167538', 'step': 2465, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.206360', 'step': 2465, 'epoch': 3} {'type': 'loss', 'content': 0.012363625690340996, 'timestamp': '2025-09-15 03:21:28.209071', 'step': 2466, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.241590', 'step': 2466, 'epoch': 3} {'type': 'loss', 'content': 0.007483890745788813, 'timestamp': '2025-09-15 03:21:28.243840', 'step': 2467, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.275003', 'step': 2467, 'epoch': 3} {'type': 'loss', 'content': 0.021944841369986534, 'timestamp': '2025-09-15 03:21:28.300640', 'step': 2468, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.332649', 'step': 2468, 'epoch': 3} {'type': 'loss', 'content': 0.01066376268863678, 'timestamp': '2025-09-15 03:21:28.334878', 'step': 2469, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.365115', 'step': 2469, 'epoch': 3} {'type': 'loss', 'content': 0.007398658897727728, 'timestamp': '2025-09-15 03:21:28.367525', 'step': 2470, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.402939', 'step': 2470, 'epoch': 3} {'type': 'loss', 'content': 0.0008656633435748518, 'timestamp': '2025-09-15 03:21:28.407339', 'step': 2471, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.449135', 'step': 2471, 'epoch': 3} {'type': 'loss', 'content': 0.006925874389708042, 'timestamp': '2025-09-15 03:21:28.472859', 'step': 2472, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.503149', 'step': 2472, 'epoch': 3} {'type': 'loss', 'content': 0.02251579985022545, 'timestamp': '2025-09-15 03:21:28.505242', 'step': 2473, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.535784', 'step': 2473, 'epoch': 3} {'type': 'loss', 'content': 0.00035866329562850296, 'timestamp': '2025-09-15 03:21:28.537832', 'step': 2474, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.568261', 'step': 2474, 'epoch': 3} {'type': 'loss', 'content': 0.0008645497146062553, 'timestamp': '2025-09-15 03:21:28.578513', 'step': 2475, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.611000', 'step': 2475, 'epoch': 3} {'type': 'loss', 'content': 0.006888339761644602, 'timestamp': '2025-09-15 03:21:28.634600', 'step': 2476, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.665409', 'step': 2476, 'epoch': 3} {'type': 'loss', 'content': 0.0005782764637842774, 'timestamp': '2025-09-15 03:21:28.667861', 'step': 2477, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.709237', 'step': 2477, 'epoch': 3} {'type': 'loss', 'content': 0.022311091423034668, 'timestamp': '2025-09-15 03:21:28.716463', 'step': 2478, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.747526', 'step': 2478, 'epoch': 3} {'type': 'loss', 'content': 0.0008715793373994529, 'timestamp': '2025-09-15 03:21:28.749803', 'step': 2479, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.780909', 'step': 2479, 'epoch': 3} {'type': 'loss', 'content': 0.02877902425825596, 'timestamp': '2025-09-15 03:21:28.804379', 'step': 2480, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.834846', 'step': 2480, 'epoch': 3} {'type': 'loss', 'content': 0.0014250698732212186, 'timestamp': '2025-09-15 03:21:28.837078', 'step': 2481, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:28.868960', 'step': 2481, 'epoch': 3} {'type': 'loss', 'content': 0.00030158410663716495, 'timestamp': '2025-09-15 03:21:28.871467', 'step': 2482, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:28.901685', 'step': 2482, 'epoch': 3} {'type': 'loss', 'content': 0.00030347550637088716, 'timestamp': '2025-09-15 03:21:28.903831', 'step': 2483, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:28.934126', 'step': 2483, 'epoch': 3} {'type': 'loss', 'content': 0.00023485065321438015, 'timestamp': '2025-09-15 03:21:28.957670', 'step': 2484, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:28.990306', 'step': 2484, 'epoch': 3} {'type': 'loss', 'content': 0.029425393790006638, 'timestamp': '2025-09-15 03:21:28.992656', 'step': 2485, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.024933', 'step': 2485, 'epoch': 3} {'type': 'loss', 'content': 0.006918161641806364, 'timestamp': '2025-09-15 03:21:29.027215', 'step': 2486, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.057954', 'step': 2486, 'epoch': 3} {'type': 'loss', 'content': 0.006680171005427837, 'timestamp': '2025-09-15 03:21:29.060481', 'step': 2487, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.091190', 'step': 2487, 'epoch': 3} {'type': 'loss', 'content': 0.00960888247936964, 'timestamp': '2025-09-15 03:21:29.114762', 'step': 2488, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:29.147338', 'step': 2488, 'epoch': 3} {'type': 'loss', 'content': 0.0015458361012861133, 'timestamp': '2025-09-15 03:21:29.149530', 'step': 2489, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.180205', 'step': 2489, 'epoch': 3} {'type': 'loss', 'content': 0.0001561766694067046, 'timestamp': '2025-09-15 03:21:29.182478', 'step': 2490, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.213049', 'step': 2490, 'epoch': 3} {'type': 'loss', 'content': 8.076949598034844e-05, 'timestamp': '2025-09-15 03:21:29.218111', 'step': 2491, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:29.251496', 'step': 2491, 'epoch': 3} {'type': 'loss', 'content': 0.03969201073050499, 'timestamp': '2025-09-15 03:21:29.275086', 'step': 2492, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:29.306324', 'step': 2492, 'epoch': 3} {'type': 'loss', 'content': 0.00016137374041136354, 'timestamp': '2025-09-15 03:21:29.308586', 'step': 2493, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.339007', 'step': 2493, 'epoch': 3} {'type': 'loss', 'content': 0.0026815198361873627, 'timestamp': '2025-09-15 03:21:29.341289', 'step': 2494, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.373253', 'step': 2494, 'epoch': 3} {'type': 'loss', 'content': 0.015436379238963127, 'timestamp': '2025-09-15 03:21:29.375440', 'step': 2495, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.405720', 'step': 2495, 'epoch': 3} {'type': 'loss', 'content': 0.00012169930414529517, 'timestamp': '2025-09-15 03:21:29.429266', 'step': 2496, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.459466', 'step': 2496, 'epoch': 3} {'type': 'loss', 'content': 0.017209528014063835, 'timestamp': '2025-09-15 03:21:29.461950', 'step': 2497, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:29.493119', 'step': 2497, 'epoch': 3} {'type': 'loss', 'content': 0.002141644014045596, 'timestamp': '2025-09-15 03:21:29.495513', 'step': 2498, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.528016', 'step': 2498, 'epoch': 3} {'type': 'loss', 'content': 0.01186156552284956, 'timestamp': '2025-09-15 03:21:29.530376', 'step': 2499, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:29.561450', 'step': 2499, 'epoch': 3} {'type': 'loss', 'content': 0.0002478655078448355, 'timestamp': '2025-09-15 03:21:29.585122', 'step': 2500, 'epoch': 3} {'type': 'info', 'content': 'Checkpoint saved at step 2500', 'timestamp': '2025-09-15 03:21:35.884158', 'step': 2500, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:35.930457', 'step': 2500, 'epoch': 3} {'type': 'loss', 'content': 0.0007553789764642715, 'timestamp': '2025-09-15 03:21:35.932658', 'step': 2501, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:35.963635', 'step': 2501, 'epoch': 3} {'type': 'loss', 'content': 0.0018034816021099687, 'timestamp': '2025-09-15 03:21:35.965881', 'step': 2502, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:35.996624', 'step': 2502, 'epoch': 3} {'type': 'loss', 'content': 0.013548840768635273, 'timestamp': '2025-09-15 03:21:35.998793', 'step': 2503, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:36.029466', 'step': 2503, 'epoch': 3} {'type': 'loss', 'content': 0.03326854854822159, 'timestamp': '2025-09-15 03:21:36.053312', 'step': 2504, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:36.083613', 'step': 2504, 'epoch': 3} {'type': 'loss', 'content': 0.0012910410296171904, 'timestamp': '2025-09-15 03:21:36.085624', 'step': 2505, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:36.115702', 'step': 2505, 'epoch': 3} {'type': 'loss', 'content': 0.001528589054942131, 'timestamp': '2025-09-15 03:21:36.117864', 'step': 2506, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:36.148799', 'step': 2506, 'epoch': 3} {'type': 'loss', 'content': 0.0031019821763038635, 'timestamp': '2025-09-15 03:21:36.153507', 'step': 2507, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:36.185322', 'step': 2507, 'epoch': 3} {'type': 'loss', 'content': 0.026747560128569603, 'timestamp': '2025-09-15 03:21:36.212291', 'step': 2508, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:36.966405', 'step': 2508, 'epoch': 3} {'type': 'pplx', 'content': 43281895.896381795, 'timestamp': '2025-09-15 03:21:36.968451', 'step': 2508, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:36.997379', 'step': 2508, 'epoch': 3} {'type': 'loss', 'content': 0.02395753376185894, 'timestamp': '2025-09-15 03:21:36.999756', 'step': 2509, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.030864', 'step': 2509, 'epoch': 3} {'type': 'loss', 'content': 0.001957479165866971, 'timestamp': '2025-09-15 03:21:37.033173', 'step': 2510, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.064203', 'step': 2510, 'epoch': 3} {'type': 'loss', 'content': 0.002216178458184004, 'timestamp': '2025-09-15 03:21:37.066566', 'step': 2511, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.098731', 'step': 2511, 'epoch': 3} {'type': 'loss', 'content': 0.008040383458137512, 'timestamp': '2025-09-15 03:21:37.122489', 'step': 2512, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.153206', 'step': 2512, 'epoch': 3} {'type': 'loss', 'content': 0.006515815854072571, 'timestamp': '2025-09-15 03:21:37.155383', 'step': 2513, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.185781', 'step': 2513, 'epoch': 3} {'type': 'loss', 'content': 0.0028326960746198893, 'timestamp': '2025-09-15 03:21:37.188124', 'step': 2514, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:37.218547', 'step': 2514, 'epoch': 3} {'type': 'loss', 'content': 0.007095829583704472, 'timestamp': '2025-09-15 03:21:37.220693', 'step': 2515, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:37.251125', 'step': 2515, 'epoch': 3} {'type': 'loss', 'content': 0.012786582112312317, 'timestamp': '2025-09-15 03:21:37.274672', 'step': 2516, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.305225', 'step': 2516, 'epoch': 3} {'type': 'loss', 'content': 0.027167638763785362, 'timestamp': '2025-09-15 03:21:37.307299', 'step': 2517, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.338024', 'step': 2517, 'epoch': 3} {'type': 'loss', 'content': 0.005034402012825012, 'timestamp': '2025-09-15 03:21:37.340126', 'step': 2518, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.370443', 'step': 2518, 'epoch': 3} {'type': 'loss', 'content': 0.030000852420926094, 'timestamp': '2025-09-15 03:21:37.372524', 'step': 2519, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.403080', 'step': 2519, 'epoch': 3} {'type': 'loss', 'content': 0.006657686084508896, 'timestamp': '2025-09-15 03:21:37.426596', 'step': 2520, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.457937', 'step': 2520, 'epoch': 3} {'type': 'loss', 'content': 0.01242148783057928, 'timestamp': '2025-09-15 03:21:37.460252', 'step': 2521, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.491338', 'step': 2521, 'epoch': 3} {'type': 'loss', 'content': 0.0017653640825301409, 'timestamp': '2025-09-15 03:21:37.493498', 'step': 2522, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.523332', 'step': 2522, 'epoch': 3} {'type': 'loss', 'content': 0.05272073671221733, 'timestamp': '2025-09-15 03:21:37.525466', 'step': 2523, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.555762', 'step': 2523, 'epoch': 3} {'type': 'loss', 'content': 0.022209199145436287, 'timestamp': '2025-09-15 03:21:37.579357', 'step': 2524, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.611390', 'step': 2524, 'epoch': 3} {'type': 'loss', 'content': 0.001376944943331182, 'timestamp': '2025-09-15 03:21:37.613464', 'step': 2525, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.644050', 'step': 2525, 'epoch': 3} {'type': 'loss', 'content': 0.004452253691852093, 'timestamp': '2025-09-15 03:21:37.646248', 'step': 2526, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.676967', 'step': 2526, 'epoch': 3} {'type': 'loss', 'content': 0.00443139998242259, 'timestamp': '2025-09-15 03:21:37.679186', 'step': 2527, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:37.710003', 'step': 2527, 'epoch': 3} {'type': 'loss', 'content': 0.01706121675670147, 'timestamp': '2025-09-15 03:21:37.733427', 'step': 2528, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.764700', 'step': 2528, 'epoch': 3} {'type': 'loss', 'content': 0.0020160474814474583, 'timestamp': '2025-09-15 03:21:37.766793', 'step': 2529, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:37.797458', 'step': 2529, 'epoch': 3} {'type': 'loss', 'content': 0.00645886966958642, 'timestamp': '2025-09-15 03:21:37.799619', 'step': 2530, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.829867', 'step': 2530, 'epoch': 3} {'type': 'loss', 'content': 0.004868703428655863, 'timestamp': '2025-09-15 03:21:37.832096', 'step': 2531, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.862932', 'step': 2531, 'epoch': 3} {'type': 'loss', 'content': 0.0013749711215496063, 'timestamp': '2025-09-15 03:21:37.886649', 'step': 2532, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.919169', 'step': 2532, 'epoch': 3} {'type': 'loss', 'content': 0.01655150018632412, 'timestamp': '2025-09-15 03:21:37.921663', 'step': 2533, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:37.951927', 'step': 2533, 'epoch': 3} {'type': 'loss', 'content': 0.010359219275414944, 'timestamp': '2025-09-15 03:21:37.954587', 'step': 2534, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:37.984742', 'step': 2534, 'epoch': 3} {'type': 'loss', 'content': 0.0017317746533080935, 'timestamp': '2025-09-15 03:21:37.986790', 'step': 2535, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.017483', 'step': 2535, 'epoch': 3} {'type': 'loss', 'content': 0.002996358321979642, 'timestamp': '2025-09-15 03:21:38.040977', 'step': 2536, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.071618', 'step': 2536, 'epoch': 3} {'type': 'loss', 'content': 0.006995683070272207, 'timestamp': '2025-09-15 03:21:38.073837', 'step': 2537, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:38.104907', 'step': 2537, 'epoch': 3} {'type': 'loss', 'content': 0.008663954213261604, 'timestamp': '2025-09-15 03:21:38.106977', 'step': 2538, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:38.137751', 'step': 2538, 'epoch': 3} {'type': 'loss', 'content': 0.011092414148151875, 'timestamp': '2025-09-15 03:21:38.139837', 'step': 2539, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:38.170667', 'step': 2539, 'epoch': 3} {'type': 'loss', 'content': 0.00014628804638050497, 'timestamp': '2025-09-15 03:21:38.194307', 'step': 2540, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.225229', 'step': 2540, 'epoch': 3} {'type': 'loss', 'content': 0.00014245144848246127, 'timestamp': '2025-09-15 03:21:38.227552', 'step': 2541, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.260307', 'step': 2541, 'epoch': 3} {'type': 'loss', 'content': 0.0020488626323640347, 'timestamp': '2025-09-15 03:21:38.262358', 'step': 2542, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.293162', 'step': 2542, 'epoch': 3} {'type': 'loss', 'content': 0.002959874924272299, 'timestamp': '2025-09-15 03:21:38.295284', 'step': 2543, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.325618', 'step': 2543, 'epoch': 3} {'type': 'loss', 'content': 0.02810373343527317, 'timestamp': '2025-09-15 03:21:38.349107', 'step': 2544, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:38.379939', 'step': 2544, 'epoch': 3} {'type': 'loss', 'content': 0.001163217588327825, 'timestamp': '2025-09-15 03:21:38.382234', 'step': 2545, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.412962', 'step': 2545, 'epoch': 3} {'type': 'loss', 'content': 0.0035799711477011442, 'timestamp': '2025-09-15 03:21:38.415091', 'step': 2546, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.446458', 'step': 2546, 'epoch': 3} {'type': 'loss', 'content': 0.0019874710123986006, 'timestamp': '2025-09-15 03:21:38.448666', 'step': 2547, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.480161', 'step': 2547, 'epoch': 3} {'type': 'loss', 'content': 0.0012459383578971028, 'timestamp': '2025-09-15 03:21:38.503874', 'step': 2548, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:38.534927', 'step': 2548, 'epoch': 3} {'type': 'loss', 'content': 0.010002275928854942, 'timestamp': '2025-09-15 03:21:38.537134', 'step': 2549, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.567964', 'step': 2549, 'epoch': 3} {'type': 'loss', 'content': 0.016585873439908028, 'timestamp': '2025-09-15 03:21:38.570108', 'step': 2550, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.601069', 'step': 2550, 'epoch': 3} {'type': 'loss', 'content': 0.02941265143454075, 'timestamp': '2025-09-15 03:21:38.604959', 'step': 2551, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.635337', 'step': 2551, 'epoch': 3} {'type': 'loss', 'content': 0.006005280185490847, 'timestamp': '2025-09-15 03:21:38.659938', 'step': 2552, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.692205', 'step': 2552, 'epoch': 3} {'type': 'loss', 'content': 0.0008412003517150879, 'timestamp': '2025-09-15 03:21:38.694480', 'step': 2553, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:38.724620', 'step': 2553, 'epoch': 3} {'type': 'loss', 'content': 0.03075595758855343, 'timestamp': '2025-09-15 03:21:38.727017', 'step': 2554, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.757887', 'step': 2554, 'epoch': 3} {'type': 'loss', 'content': 5.689586396329105e-05, 'timestamp': '2025-09-15 03:21:38.760115', 'step': 2555, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.791445', 'step': 2555, 'epoch': 3} {'type': 'loss', 'content': 0.000128539526485838, 'timestamp': '2025-09-15 03:21:38.815082', 'step': 2556, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.846010', 'step': 2556, 'epoch': 3} {'type': 'loss', 'content': 0.0015599167672917247, 'timestamp': '2025-09-15 03:21:38.848241', 'step': 2557, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.878543', 'step': 2557, 'epoch': 3} {'type': 'loss', 'content': 0.030791833996772766, 'timestamp': '2025-09-15 03:21:38.880768', 'step': 2558, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:38.911233', 'step': 2558, 'epoch': 3} {'type': 'loss', 'content': 0.007090753875672817, 'timestamp': '2025-09-15 03:21:38.913565', 'step': 2559, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:38.944558', 'step': 2559, 'epoch': 3} {'type': 'loss', 'content': 0.0018526233034208417, 'timestamp': '2025-09-15 03:21:38.968024', 'step': 2560, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:38.998581', 'step': 2560, 'epoch': 3} {'type': 'loss', 'content': 0.0008933954522944987, 'timestamp': '2025-09-15 03:21:39.000671', 'step': 2561, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:39.031172', 'step': 2561, 'epoch': 3} {'type': 'loss', 'content': 0.00965001992881298, 'timestamp': '2025-09-15 03:21:39.033302', 'step': 2562, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:39.063769', 'step': 2562, 'epoch': 3} {'type': 'loss', 'content': 0.0011859569931402802, 'timestamp': '2025-09-15 03:21:39.065844', 'step': 2563, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:39.096584', 'step': 2563, 'epoch': 3} {'type': 'loss', 'content': 0.003663228126242757, 'timestamp': '2025-09-15 03:21:39.120260', 'step': 2564, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:39.150962', 'step': 2564, 'epoch': 3} {'type': 'loss', 'content': 0.0022224951535463333, 'timestamp': '2025-09-15 03:21:39.153033', 'step': 2565, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:39.883862', 'step': 2565, 'epoch': 3} {'type': 'pplx', 'content': 45366570.01248686, 'timestamp': '2025-09-15 03:21:39.885759', 'step': 2565, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:39.914331', 'step': 2565, 'epoch': 3} {'type': 'loss', 'content': 0.006390959955751896, 'timestamp': '2025-09-15 03:21:39.916645', 'step': 2566, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:39.947894', 'step': 2566, 'epoch': 3} {'type': 'loss', 'content': 0.0008488258463330567, 'timestamp': '2025-09-15 03:21:39.950081', 'step': 2567, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:39.982017', 'step': 2567, 'epoch': 3} {'type': 'loss', 'content': 0.0018424444133415818, 'timestamp': '2025-09-15 03:21:40.005688', 'step': 2568, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.036536', 'step': 2568, 'epoch': 3} {'type': 'loss', 'content': 0.0006706213462166488, 'timestamp': '2025-09-15 03:21:40.038931', 'step': 2569, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.069590', 'step': 2569, 'epoch': 3} {'type': 'loss', 'content': 0.012076971121132374, 'timestamp': '2025-09-15 03:21:40.071622', 'step': 2570, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.103073', 'step': 2570, 'epoch': 3} {'type': 'loss', 'content': 0.004673096816986799, 'timestamp': '2025-09-15 03:21:40.107358', 'step': 2571, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.140778', 'step': 2571, 'epoch': 3} {'type': 'loss', 'content': 0.002782629569992423, 'timestamp': '2025-09-15 03:21:40.164974', 'step': 2572, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.196143', 'step': 2572, 'epoch': 3} {'type': 'loss', 'content': 0.000997567898593843, 'timestamp': '2025-09-15 03:21:40.198180', 'step': 2573, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.228489', 'step': 2573, 'epoch': 3} {'type': 'loss', 'content': 0.005449178162962198, 'timestamp': '2025-09-15 03:21:40.230719', 'step': 2574, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:40.262528', 'step': 2574, 'epoch': 3} {'type': 'loss', 'content': 0.004305838141590357, 'timestamp': '2025-09-15 03:21:40.264752', 'step': 2575, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.295855', 'step': 2575, 'epoch': 3} {'type': 'loss', 'content': 0.03190882131457329, 'timestamp': '2025-09-15 03:21:40.319488', 'step': 2576, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.349814', 'step': 2576, 'epoch': 3} {'type': 'loss', 'content': 0.0013857169542461634, 'timestamp': '2025-09-15 03:21:40.351941', 'step': 2577, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:40.387765', 'step': 2577, 'epoch': 3} {'type': 'loss', 'content': 0.003061884781345725, 'timestamp': '2025-09-15 03:21:40.391136', 'step': 2578, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.424284', 'step': 2578, 'epoch': 3} {'type': 'loss', 'content': 0.004191776271909475, 'timestamp': '2025-09-15 03:21:40.428273', 'step': 2579, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.458953', 'step': 2579, 'epoch': 3} {'type': 'loss', 'content': 0.008370229043066502, 'timestamp': '2025-09-15 03:21:40.482653', 'step': 2580, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.534777', 'step': 2580, 'epoch': 3} {'type': 'loss', 'content': 0.0011418802896514535, 'timestamp': '2025-09-15 03:21:40.537012', 'step': 2581, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.567881', 'step': 2581, 'epoch': 3} {'type': 'loss', 'content': 0.003366305958479643, 'timestamp': '2025-09-15 03:21:40.570250', 'step': 2582, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.601645', 'step': 2582, 'epoch': 3} {'type': 'loss', 'content': 0.001745986519381404, 'timestamp': '2025-09-15 03:21:40.606397', 'step': 2583, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.637319', 'step': 2583, 'epoch': 3} {'type': 'loss', 'content': 0.0017125660087913275, 'timestamp': '2025-09-15 03:21:40.660787', 'step': 2584, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.692077', 'step': 2584, 'epoch': 3} {'type': 'loss', 'content': 0.0020126174204051495, 'timestamp': '2025-09-15 03:21:40.694163', 'step': 2585, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.732749', 'step': 2585, 'epoch': 3} {'type': 'loss', 'content': 0.031251564621925354, 'timestamp': '2025-09-15 03:21:40.734854', 'step': 2586, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.765176', 'step': 2586, 'epoch': 3} {'type': 'loss', 'content': 0.0018158911261707544, 'timestamp': '2025-09-15 03:21:40.767609', 'step': 2587, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:40.798034', 'step': 2587, 'epoch': 3} {'type': 'loss', 'content': 0.002623507520183921, 'timestamp': '2025-09-15 03:21:40.821425', 'step': 2588, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:40.865134', 'step': 2588, 'epoch': 3} {'type': 'loss', 'content': 0.003804678563028574, 'timestamp': '2025-09-15 03:21:40.867921', 'step': 2589, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.898236', 'step': 2589, 'epoch': 3} {'type': 'loss', 'content': 0.0029680945444852114, 'timestamp': '2025-09-15 03:21:40.900375', 'step': 2590, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:40.930968', 'step': 2590, 'epoch': 3} {'type': 'loss', 'content': 0.006071859505027533, 'timestamp': '2025-09-15 03:21:40.934342', 'step': 2591, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:40.966265', 'step': 2591, 'epoch': 3} {'type': 'loss', 'content': 0.0016739203128963709, 'timestamp': '2025-09-15 03:21:40.990123', 'step': 2592, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:41.020829', 'step': 2592, 'epoch': 3} {'type': 'loss', 'content': 0.0012125660432502627, 'timestamp': '2025-09-15 03:21:41.022933', 'step': 2593, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.052891', 'step': 2593, 'epoch': 3} {'type': 'loss', 'content': 0.0016867046942934394, 'timestamp': '2025-09-15 03:21:41.055090', 'step': 2594, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.085592', 'step': 2594, 'epoch': 3} {'type': 'loss', 'content': 0.0024879786651581526, 'timestamp': '2025-09-15 03:21:41.088119', 'step': 2595, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 160], 'flops': 4746299996032}, 'timestamp': '2025-09-15 03:21:41.119045', 'step': 2595, 'epoch': 3} {'type': 'loss', 'content': 0.01448439247906208, 'timestamp': '2025-09-15 03:21:41.142739', 'step': 2596, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.172775', 'step': 2596, 'epoch': 3} {'type': 'loss', 'content': 0.00893432553857565, 'timestamp': '2025-09-15 03:21:41.174925', 'step': 2597, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.205102', 'step': 2597, 'epoch': 3} {'type': 'loss', 'content': 0.003067398676648736, 'timestamp': '2025-09-15 03:21:41.207196', 'step': 2598, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.237417', 'step': 2598, 'epoch': 3} {'type': 'loss', 'content': 0.001361220725812018, 'timestamp': '2025-09-15 03:21:41.239563', 'step': 2599, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.270230', 'step': 2599, 'epoch': 3} {'type': 'loss', 'content': 0.0022079572081565857, 'timestamp': '2025-09-15 03:21:41.293871', 'step': 2600, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.324734', 'step': 2600, 'epoch': 3} {'type': 'loss', 'content': 0.0025524452794343233, 'timestamp': '2025-09-15 03:21:41.327302', 'step': 2601, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.357904', 'step': 2601, 'epoch': 3} {'type': 'loss', 'content': 0.0015212270664051175, 'timestamp': '2025-09-15 03:21:41.359965', 'step': 2602, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.390604', 'step': 2602, 'epoch': 3} {'type': 'loss', 'content': 0.00080554757732898, 'timestamp': '2025-09-15 03:21:41.392758', 'step': 2603, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.424261', 'step': 2603, 'epoch': 3} {'type': 'loss', 'content': 0.0060416231863200665, 'timestamp': '2025-09-15 03:21:41.447677', 'step': 2604, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:41.478527', 'step': 2604, 'epoch': 3} {'type': 'loss', 'content': 0.008705796673893929, 'timestamp': '2025-09-15 03:21:41.480695', 'step': 2605, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:41.510965', 'step': 2605, 'epoch': 3} {'type': 'loss', 'content': 0.00011078565148636699, 'timestamp': '2025-09-15 03:21:41.513295', 'step': 2606, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.543566', 'step': 2606, 'epoch': 3} {'type': 'loss', 'content': 0.031115766614675522, 'timestamp': '2025-09-15 03:21:41.545946', 'step': 2607, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.576610', 'step': 2607, 'epoch': 3} {'type': 'loss', 'content': 0.0001265882165171206, 'timestamp': '2025-09-15 03:21:41.600105', 'step': 2608, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:41.631427', 'step': 2608, 'epoch': 3} {'type': 'loss', 'content': 0.0014504080172628164, 'timestamp': '2025-09-15 03:21:41.633831', 'step': 2609, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:41.665216', 'step': 2609, 'epoch': 3} {'type': 'loss', 'content': 0.0015388702740892768, 'timestamp': '2025-09-15 03:21:41.667566', 'step': 2610, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:41.698328', 'step': 2610, 'epoch': 3} {'type': 'loss', 'content': 0.0017330830451101065, 'timestamp': '2025-09-15 03:21:41.700618', 'step': 2611, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.730813', 'step': 2611, 'epoch': 3} {'type': 'loss', 'content': 0.0002649607486091554, 'timestamp': '2025-09-15 03:21:41.754623', 'step': 2612, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:41.785691', 'step': 2612, 'epoch': 3} {'type': 'loss', 'content': 0.04479163885116577, 'timestamp': '2025-09-15 03:21:41.787879', 'step': 2613, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.818645', 'step': 2613, 'epoch': 3} {'type': 'loss', 'content': 0.00031301137642003596, 'timestamp': '2025-09-15 03:21:41.820748', 'step': 2614, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.851131', 'step': 2614, 'epoch': 3} {'type': 'loss', 'content': 0.001099871238693595, 'timestamp': '2025-09-15 03:21:41.853299', 'step': 2615, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:41.884701', 'step': 2615, 'epoch': 3} {'type': 'loss', 'content': 0.0005943683790974319, 'timestamp': '2025-09-15 03:21:41.908528', 'step': 2616, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:41.940357', 'step': 2616, 'epoch': 3} {'type': 'loss', 'content': 0.013975650072097778, 'timestamp': '2025-09-15 03:21:41.942502', 'step': 2617, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:41.973872', 'step': 2617, 'epoch': 3} {'type': 'loss', 'content': 0.00039200225728563964, 'timestamp': '2025-09-15 03:21:41.976310', 'step': 2618, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:42.011271', 'step': 2618, 'epoch': 3} {'type': 'loss', 'content': 9.524152847006917e-05, 'timestamp': '2025-09-15 03:21:42.013447', 'step': 2619, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:42.043877', 'step': 2619, 'epoch': 3} {'type': 'loss', 'content': 2.9366343369474635e-05, 'timestamp': '2025-09-15 03:21:42.067400', 'step': 2620, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:42.098437', 'step': 2620, 'epoch': 3} {'type': 'loss', 'content': 2.4567301807110198e-05, 'timestamp': '2025-09-15 03:21:42.100541', 'step': 2621, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:42.131107', 'step': 2621, 'epoch': 3} {'type': 'loss', 'content': 0.006696456111967564, 'timestamp': '2025-09-15 03:21:42.133346', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:42.863649', 'step': 2622, 'epoch': 3} {'type': 'pplx', 'content': 60060753.57362444, 'timestamp': '2025-09-15 03:21:42.865790', 'step': 2622, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:42.895445', 'step': 2622, 'epoch': 3} {'type': 'loss', 'content': 0.00233951723203063, 'timestamp': '2025-09-15 03:21:42.897760', 'step': 2623, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:42.928491', 'step': 2623, 'epoch': 3} {'type': 'loss', 'content': 3.5728735383599997e-05, 'timestamp': '2025-09-15 03:21:42.952419', 'step': 2624, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:42.983760', 'step': 2624, 'epoch': 3} {'type': 'loss', 'content': 0.01346229575574398, 'timestamp': '2025-09-15 03:21:42.985857', 'step': 2625, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.017087', 'step': 2625, 'epoch': 3} {'type': 'loss', 'content': 0.004266827367246151, 'timestamp': '2025-09-15 03:21:43.019134', 'step': 2626, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.049560', 'step': 2626, 'epoch': 3} {'type': 'loss', 'content': 0.0003306028083898127, 'timestamp': '2025-09-15 03:21:43.051705', 'step': 2627, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.082198', 'step': 2627, 'epoch': 3} {'type': 'loss', 'content': 0.008768661879003048, 'timestamp': '2025-09-15 03:21:43.105873', 'step': 2628, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.136309', 'step': 2628, 'epoch': 3} {'type': 'loss', 'content': 0.00028717468376271427, 'timestamp': '2025-09-15 03:21:43.138277', 'step': 2629, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.168707', 'step': 2629, 'epoch': 3} {'type': 'loss', 'content': 0.00015357838128693402, 'timestamp': '2025-09-15 03:21:43.170817', 'step': 2630, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.201700', 'step': 2630, 'epoch': 3} {'type': 'loss', 'content': 0.0003812172799371183, 'timestamp': '2025-09-15 03:21:43.204301', 'step': 2631, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:43.235283', 'step': 2631, 'epoch': 3} {'type': 'loss', 'content': 0.011865836568176746, 'timestamp': '2025-09-15 03:21:43.259045', 'step': 2632, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.290095', 'step': 2632, 'epoch': 3} {'type': 'loss', 'content': 0.00015511747915297747, 'timestamp': '2025-09-15 03:21:43.292317', 'step': 2633, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.325492', 'step': 2633, 'epoch': 3} {'type': 'loss', 'content': 0.0004173486668150872, 'timestamp': '2025-09-15 03:21:43.327619', 'step': 2634, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.358780', 'step': 2634, 'epoch': 3} {'type': 'loss', 'content': 0.001391603727824986, 'timestamp': '2025-09-15 03:21:43.361082', 'step': 2635, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:43.392549', 'step': 2635, 'epoch': 3} {'type': 'loss', 'content': 0.010834120213985443, 'timestamp': '2025-09-15 03:21:43.416218', 'step': 2636, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.446424', 'step': 2636, 'epoch': 3} {'type': 'loss', 'content': 0.0010919544147327542, 'timestamp': '2025-09-15 03:21:43.448779', 'step': 2637, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.478725', 'step': 2637, 'epoch': 3} {'type': 'loss', 'content': 0.00014520598051603884, 'timestamp': '2025-09-15 03:21:43.480866', 'step': 2638, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.511478', 'step': 2638, 'epoch': 3} {'type': 'loss', 'content': 0.0008089410257525742, 'timestamp': '2025-09-15 03:21:43.513671', 'step': 2639, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.543782', 'step': 2639, 'epoch': 3} {'type': 'loss', 'content': 0.0015242323279380798, 'timestamp': '2025-09-15 03:21:43.567474', 'step': 2640, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.598663', 'step': 2640, 'epoch': 3} {'type': 'loss', 'content': 0.00014808375271968544, 'timestamp': '2025-09-15 03:21:43.600673', 'step': 2641, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:43.632624', 'step': 2641, 'epoch': 3} {'type': 'loss', 'content': 0.00033712037838995457, 'timestamp': '2025-09-15 03:21:43.634965', 'step': 2642, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:43.665508', 'step': 2642, 'epoch': 3} {'type': 'loss', 'content': 0.0004964851541444659, 'timestamp': '2025-09-15 03:21:43.669020', 'step': 2643, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:43.699348', 'step': 2643, 'epoch': 3} {'type': 'loss', 'content': 0.001874964451417327, 'timestamp': '2025-09-15 03:21:43.722884', 'step': 2644, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.753500', 'step': 2644, 'epoch': 3} {'type': 'loss', 'content': 0.00029709740192629397, 'timestamp': '2025-09-15 03:21:43.755674', 'step': 2645, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.785929', 'step': 2645, 'epoch': 3} {'type': 'loss', 'content': 0.0006256395718082786, 'timestamp': '2025-09-15 03:21:43.788278', 'step': 2646, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.818871', 'step': 2646, 'epoch': 3} {'type': 'loss', 'content': 0.01808498241007328, 'timestamp': '2025-09-15 03:21:43.821063', 'step': 2647, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:43.851012', 'step': 2647, 'epoch': 3} {'type': 'loss', 'content': 0.001829305081628263, 'timestamp': '2025-09-15 03:21:43.874891', 'step': 2648, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:43.906591', 'step': 2648, 'epoch': 3} {'type': 'loss', 'content': 0.0001833066053222865, 'timestamp': '2025-09-15 03:21:43.908998', 'step': 2649, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:43.939599', 'step': 2649, 'epoch': 3} {'type': 'loss', 'content': 0.00026104258722625673, 'timestamp': '2025-09-15 03:21:43.942064', 'step': 2650, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:43.972713', 'step': 2650, 'epoch': 3} {'type': 'loss', 'content': 0.00035324011696502566, 'timestamp': '2025-09-15 03:21:43.975541', 'step': 2651, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.006336', 'step': 2651, 'epoch': 3} {'type': 'loss', 'content': 0.0006148394313640893, 'timestamp': '2025-09-15 03:21:44.029899', 'step': 2652, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.061125', 'step': 2652, 'epoch': 3} {'type': 'loss', 'content': 0.00014098930114414543, 'timestamp': '2025-09-15 03:21:44.063268', 'step': 2653, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.093846', 'step': 2653, 'epoch': 3} {'type': 'loss', 'content': 0.00035440208739601076, 'timestamp': '2025-09-15 03:21:44.096056', 'step': 2654, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:44.127747', 'step': 2654, 'epoch': 3} {'type': 'loss', 'content': 0.0009488330106250942, 'timestamp': '2025-09-15 03:21:44.129929', 'step': 2655, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:44.161315', 'step': 2655, 'epoch': 3} {'type': 'loss', 'content': 0.0007009514956735075, 'timestamp': '2025-09-15 03:21:44.184869', 'step': 2656, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:44.215528', 'step': 2656, 'epoch': 3} {'type': 'loss', 'content': 0.0003025685145985335, 'timestamp': '2025-09-15 03:21:44.217657', 'step': 2657, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:44.248413', 'step': 2657, 'epoch': 3} {'type': 'loss', 'content': 0.0018814911600202322, 'timestamp': '2025-09-15 03:21:44.250676', 'step': 2658, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:44.281706', 'step': 2658, 'epoch': 3} {'type': 'loss', 'content': 0.00033558739232830703, 'timestamp': '2025-09-15 03:21:44.283788', 'step': 2659, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.314900', 'step': 2659, 'epoch': 3} {'type': 'loss', 'content': 0.00040636741323396564, 'timestamp': '2025-09-15 03:21:44.338398', 'step': 2660, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:44.368751', 'step': 2660, 'epoch': 3} {'type': 'loss', 'content': 0.00021982923499308527, 'timestamp': '2025-09-15 03:21:44.370940', 'step': 2661, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:44.402560', 'step': 2661, 'epoch': 3} {'type': 'loss', 'content': 0.0003634126915130764, 'timestamp': '2025-09-15 03:21:44.404651', 'step': 2662, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.434809', 'step': 2662, 'epoch': 3} {'type': 'loss', 'content': 0.00037710348260588944, 'timestamp': '2025-09-15 03:21:44.437307', 'step': 2663, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.467423', 'step': 2663, 'epoch': 3} {'type': 'loss', 'content': 0.009560467675328255, 'timestamp': '2025-09-15 03:21:44.491184', 'step': 2664, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.522005', 'step': 2664, 'epoch': 3} {'type': 'loss', 'content': 0.00657200813293457, 'timestamp': '2025-09-15 03:21:44.524383', 'step': 2665, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.555113', 'step': 2665, 'epoch': 3} {'type': 'loss', 'content': 0.00010232098429696634, 'timestamp': '2025-09-15 03:21:44.557346', 'step': 2666, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.588029', 'step': 2666, 'epoch': 3} {'type': 'loss', 'content': 0.00013133182073943317, 'timestamp': '2025-09-15 03:21:44.590241', 'step': 2667, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:44.621159', 'step': 2667, 'epoch': 3} {'type': 'loss', 'content': 0.013489065691828728, 'timestamp': '2025-09-15 03:21:44.644643', 'step': 2668, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:44.675978', 'step': 2668, 'epoch': 3} {'type': 'loss', 'content': 0.0016374537954106927, 'timestamp': '2025-09-15 03:21:44.678154', 'step': 2669, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.710450', 'step': 2669, 'epoch': 3} {'type': 'loss', 'content': 0.004283093381673098, 'timestamp': '2025-09-15 03:21:44.712627', 'step': 2670, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:44.744147', 'step': 2670, 'epoch': 3} {'type': 'loss', 'content': 7.831586844986305e-05, 'timestamp': '2025-09-15 03:21:44.746383', 'step': 2671, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.777353', 'step': 2671, 'epoch': 3} {'type': 'loss', 'content': 0.00014679950254503638, 'timestamp': '2025-09-15 03:21:44.801185', 'step': 2672, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.831635', 'step': 2672, 'epoch': 3} {'type': 'loss', 'content': 0.002847463358193636, 'timestamp': '2025-09-15 03:21:44.833801', 'step': 2673, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:44.864556', 'step': 2673, 'epoch': 3} {'type': 'loss', 'content': 7.834843563614413e-05, 'timestamp': '2025-09-15 03:21:44.866636', 'step': 2674, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.897191', 'step': 2674, 'epoch': 3} {'type': 'loss', 'content': 0.0002578691055532545, 'timestamp': '2025-09-15 03:21:44.899298', 'step': 2675, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.930149', 'step': 2675, 'epoch': 3} {'type': 'loss', 'content': 0.0002215011336375028, 'timestamp': '2025-09-15 03:21:44.953730', 'step': 2676, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:44.984333', 'step': 2676, 'epoch': 3} {'type': 'loss', 'content': 0.0002951612987089902, 'timestamp': '2025-09-15 03:21:44.987270', 'step': 2677, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:45.017372', 'step': 2677, 'epoch': 3} {'type': 'loss', 'content': 0.0018803089624270797, 'timestamp': '2025-09-15 03:21:45.019468', 'step': 2678, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:45.050064', 'step': 2678, 'epoch': 3} {'type': 'loss', 'content': 0.00012660605716519058, 'timestamp': '2025-09-15 03:21:45.052417', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:45.785186', 'step': 2679, 'epoch': 3} {'type': 'pplx', 'content': 68281292.55607095, 'timestamp': '2025-09-15 03:21:45.787200', 'step': 2679, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:45.816800', 'step': 2679, 'epoch': 3} {'type': 'loss', 'content': 0.0001412067358614877, 'timestamp': '2025-09-15 03:21:45.840625', 'step': 2680, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:45.872075', 'step': 2680, 'epoch': 3} {'type': 'loss', 'content': 0.0005017804214730859, 'timestamp': '2025-09-15 03:21:45.874161', 'step': 2681, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:45.905325', 'step': 2681, 'epoch': 3} {'type': 'loss', 'content': 0.0001562709512654692, 'timestamp': '2025-09-15 03:21:45.907477', 'step': 2682, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:45.938281', 'step': 2682, 'epoch': 3} {'type': 'loss', 'content': 5.530984708457254e-05, 'timestamp': '2025-09-15 03:21:45.948906', 'step': 2683, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:45.987811', 'step': 2683, 'epoch': 3} {'type': 'loss', 'content': 0.0002064589352812618, 'timestamp': '2025-09-15 03:21:46.011675', 'step': 2684, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.042031', 'step': 2684, 'epoch': 3} {'type': 'loss', 'content': 4.4304611947154626e-05, 'timestamp': '2025-09-15 03:21:46.044071', 'step': 2685, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:46.074494', 'step': 2685, 'epoch': 3} {'type': 'loss', 'content': 0.00019901491759810597, 'timestamp': '2025-09-15 03:21:46.076772', 'step': 2686, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.107441', 'step': 2686, 'epoch': 3} {'type': 'loss', 'content': 0.0010612963233143091, 'timestamp': '2025-09-15 03:21:46.109567', 'step': 2687, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:46.140285', 'step': 2687, 'epoch': 3} {'type': 'loss', 'content': 0.0007707072654739022, 'timestamp': '2025-09-15 03:21:46.163862', 'step': 2688, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.194353', 'step': 2688, 'epoch': 3} {'type': 'loss', 'content': 9.548853995511308e-05, 'timestamp': '2025-09-15 03:21:46.196595', 'step': 2689, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.227286', 'step': 2689, 'epoch': 3} {'type': 'loss', 'content': 8.759932825341821e-05, 'timestamp': '2025-09-15 03:21:46.229406', 'step': 2690, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.261499', 'step': 2690, 'epoch': 3} {'type': 'loss', 'content': 0.00026428294950164855, 'timestamp': '2025-09-15 03:21:46.264240', 'step': 2691, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:46.295436', 'step': 2691, 'epoch': 3} {'type': 'loss', 'content': 0.0005423675174824893, 'timestamp': '2025-09-15 03:21:46.319207', 'step': 2692, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.349665', 'step': 2692, 'epoch': 3} {'type': 'loss', 'content': 6.664798274869099e-05, 'timestamp': '2025-09-15 03:21:46.352016', 'step': 2693, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.382850', 'step': 2693, 'epoch': 3} {'type': 'loss', 'content': 0.00035035648033954203, 'timestamp': '2025-09-15 03:21:46.385436', 'step': 2694, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.417601', 'step': 2694, 'epoch': 3} {'type': 'loss', 'content': 0.0007025161758065224, 'timestamp': '2025-09-15 03:21:46.419788', 'step': 2695, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:46.450857', 'step': 2695, 'epoch': 3} {'type': 'loss', 'content': 0.0002691586851142347, 'timestamp': '2025-09-15 03:21:46.474565', 'step': 2696, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.505154', 'step': 2696, 'epoch': 3} {'type': 'loss', 'content': 0.00010667191963875666, 'timestamp': '2025-09-15 03:21:46.507460', 'step': 2697, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.537799', 'step': 2697, 'epoch': 3} {'type': 'loss', 'content': 0.00014287196972873062, 'timestamp': '2025-09-15 03:21:46.540096', 'step': 2698, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.570506', 'step': 2698, 'epoch': 3} {'type': 'loss', 'content': 0.004734222777187824, 'timestamp': '2025-09-15 03:21:46.572714', 'step': 2699, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:46.603948', 'step': 2699, 'epoch': 3} {'type': 'loss', 'content': 9.962467447621748e-05, 'timestamp': '2025-09-15 03:21:46.628968', 'step': 2700, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.659915', 'step': 2700, 'epoch': 3} {'type': 'loss', 'content': 0.00013182398106437176, 'timestamp': '2025-09-15 03:21:46.663241', 'step': 2701, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.694487', 'step': 2701, 'epoch': 3} {'type': 'loss', 'content': 8.584625174989924e-05, 'timestamp': '2025-09-15 03:21:46.696182', 'step': 2702, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:46.726822', 'step': 2702, 'epoch': 3} {'type': 'loss', 'content': 0.00016575524932704866, 'timestamp': '2025-09-15 03:21:46.728860', 'step': 2703, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.759112', 'step': 2703, 'epoch': 3} {'type': 'loss', 'content': 9.599862096365541e-05, 'timestamp': '2025-09-15 03:21:46.782761', 'step': 2704, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.814339', 'step': 2704, 'epoch': 3} {'type': 'loss', 'content': 0.0006985267391428351, 'timestamp': '2025-09-15 03:21:46.817011', 'step': 2705, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.848214', 'step': 2705, 'epoch': 3} {'type': 'loss', 'content': 0.0006570377154275775, 'timestamp': '2025-09-15 03:21:46.850706', 'step': 2706, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.881338', 'step': 2706, 'epoch': 3} {'type': 'loss', 'content': 5.436737774289213e-05, 'timestamp': '2025-09-15 03:21:46.883412', 'step': 2707, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.913573', 'step': 2707, 'epoch': 3} {'type': 'loss', 'content': 0.00018206385720986873, 'timestamp': '2025-09-15 03:21:46.937354', 'step': 2708, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:46.969934', 'step': 2708, 'epoch': 3} {'type': 'loss', 'content': 0.00015278191131073982, 'timestamp': '2025-09-15 03:21:46.972011', 'step': 2709, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.002584', 'step': 2709, 'epoch': 3} {'type': 'loss', 'content': 0.00026051508029922843, 'timestamp': '2025-09-15 03:21:47.004840', 'step': 2710, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.035438', 'step': 2710, 'epoch': 3} {'type': 'loss', 'content': 7.116630877135321e-05, 'timestamp': '2025-09-15 03:21:47.037632', 'step': 2711, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.069438', 'step': 2711, 'epoch': 3} {'type': 'loss', 'content': 4.763676770380698e-05, 'timestamp': '2025-09-15 03:21:47.092859', 'step': 2712, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.124309', 'step': 2712, 'epoch': 3} {'type': 'loss', 'content': 0.0005188211798667908, 'timestamp': '2025-09-15 03:21:47.126479', 'step': 2713, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:47.157195', 'step': 2713, 'epoch': 3} {'type': 'loss', 'content': 0.014473472721874714, 'timestamp': '2025-09-15 03:21:47.159377', 'step': 2714, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.190566', 'step': 2714, 'epoch': 3} {'type': 'loss', 'content': 0.022033294662833214, 'timestamp': '2025-09-15 03:21:47.192938', 'step': 2715, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.223935', 'step': 2715, 'epoch': 3} {'type': 'loss', 'content': 8.771099237492308e-05, 'timestamp': '2025-09-15 03:21:47.247565', 'step': 2716, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.279232', 'step': 2716, 'epoch': 3} {'type': 'loss', 'content': 0.0009455090621486306, 'timestamp': '2025-09-15 03:21:47.282112', 'step': 2717, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.313133', 'step': 2717, 'epoch': 3} {'type': 'loss', 'content': 2.2676438675262034e-05, 'timestamp': '2025-09-15 03:21:47.316036', 'step': 2718, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.346397', 'step': 2718, 'epoch': 3} {'type': 'loss', 'content': 0.000588989001698792, 'timestamp': '2025-09-15 03:21:47.348461', 'step': 2719, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.379888', 'step': 2719, 'epoch': 3} {'type': 'loss', 'content': 0.003522381419315934, 'timestamp': '2025-09-15 03:21:47.403617', 'step': 2720, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.434835', 'step': 2720, 'epoch': 3} {'type': 'loss', 'content': 9.874381794361398e-05, 'timestamp': '2025-09-15 03:21:47.437366', 'step': 2721, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.468699', 'step': 2721, 'epoch': 3} {'type': 'loss', 'content': 6.181753269629553e-05, 'timestamp': '2025-09-15 03:21:47.470831', 'step': 2722, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.503234', 'step': 2722, 'epoch': 3} {'type': 'loss', 'content': 0.021309752017259598, 'timestamp': '2025-09-15 03:21:47.505418', 'step': 2723, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.536114', 'step': 2723, 'epoch': 3} {'type': 'loss', 'content': 5.099936606711708e-05, 'timestamp': '2025-09-15 03:21:47.559681', 'step': 2724, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.590837', 'step': 2724, 'epoch': 3} {'type': 'loss', 'content': 0.019538061693310738, 'timestamp': '2025-09-15 03:21:47.593050', 'step': 2725, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.623804', 'step': 2725, 'epoch': 3} {'type': 'loss', 'content': 0.010590228252112865, 'timestamp': '2025-09-15 03:21:47.625964', 'step': 2726, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.656938', 'step': 2726, 'epoch': 3} {'type': 'loss', 'content': 0.0008069784962572157, 'timestamp': '2025-09-15 03:21:47.659467', 'step': 2727, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.690899', 'step': 2727, 'epoch': 3} {'type': 'loss', 'content': 0.0004390797985251993, 'timestamp': '2025-09-15 03:21:47.716573', 'step': 2728, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.748630', 'step': 2728, 'epoch': 3} {'type': 'loss', 'content': 0.0014442296233028173, 'timestamp': '2025-09-15 03:21:47.750823', 'step': 2729, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.781695', 'step': 2729, 'epoch': 3} {'type': 'loss', 'content': 0.0001858282630564645, 'timestamp': '2025-09-15 03:21:47.784050', 'step': 2730, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.815812', 'step': 2730, 'epoch': 3} {'type': 'loss', 'content': 8.658268779981881e-05, 'timestamp': '2025-09-15 03:21:47.817992', 'step': 2731, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.850328', 'step': 2731, 'epoch': 3} {'type': 'loss', 'content': 0.0170897264033556, 'timestamp': '2025-09-15 03:21:47.873838', 'step': 2732, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.905086', 'step': 2732, 'epoch': 3} {'type': 'loss', 'content': 0.00023263516777660698, 'timestamp': '2025-09-15 03:21:47.907703', 'step': 2733, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:47.938197', 'step': 2733, 'epoch': 3} {'type': 'loss', 'content': 0.00016691691416781396, 'timestamp': '2025-09-15 03:21:47.940489', 'step': 2734, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:47.971067', 'step': 2734, 'epoch': 3} {'type': 'loss', 'content': 5.8849462220678106e-05, 'timestamp': '2025-09-15 03:21:47.973257', 'step': 2735, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:48.004099', 'step': 2735, 'epoch': 3} {'type': 'loss', 'content': 0.018125424161553383, 'timestamp': '2025-09-15 03:21:48.027968', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:48.761944', 'step': 2736, 'epoch': 3} {'type': 'pplx', 'content': 67706275.31001619, 'timestamp': '2025-09-15 03:21:48.764361', 'step': 2736, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:48.794358', 'step': 2736, 'epoch': 3} {'type': 'loss', 'content': 0.00023372893338091671, 'timestamp': '2025-09-15 03:21:48.796425', 'step': 2737, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:48.826981', 'step': 2737, 'epoch': 3} {'type': 'loss', 'content': 0.00027472517103888094, 'timestamp': '2025-09-15 03:21:48.829412', 'step': 2738, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:48.861155', 'step': 2738, 'epoch': 3} {'type': 'loss', 'content': 0.0010784949408844113, 'timestamp': '2025-09-15 03:21:48.863355', 'step': 2739, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:48.895491', 'step': 2739, 'epoch': 3} {'type': 'loss', 'content': 0.00026522178086452186, 'timestamp': '2025-09-15 03:21:48.919244', 'step': 2740, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:48.949712', 'step': 2740, 'epoch': 3} {'type': 'loss', 'content': 0.0005690989783033729, 'timestamp': '2025-09-15 03:21:48.951854', 'step': 2741, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:48.983057', 'step': 2741, 'epoch': 3} {'type': 'loss', 'content': 0.003584182122722268, 'timestamp': '2025-09-15 03:21:48.985187', 'step': 2742, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:49.016147', 'step': 2742, 'epoch': 3} {'type': 'loss', 'content': 0.01012119185179472, 'timestamp': '2025-09-15 03:21:49.018323', 'step': 2743, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:49.049155', 'step': 2743, 'epoch': 3} {'type': 'loss', 'content': 0.008594135753810406, 'timestamp': '2025-09-15 03:21:49.072888', 'step': 2744, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:49.103719', 'step': 2744, 'epoch': 3} {'type': 'loss', 'content': 0.0050062802620232105, 'timestamp': '2025-09-15 03:21:49.106022', 'step': 2745, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:49.138939', 'step': 2745, 'epoch': 3} {'type': 'loss', 'content': 0.023680929094552994, 'timestamp': '2025-09-15 03:21:49.141087', 'step': 2746, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:49.171554', 'step': 2746, 'epoch': 3} {'type': 'loss', 'content': 0.0008458025404252112, 'timestamp': '2025-09-15 03:21:49.173832', 'step': 2747, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 112], 'flops': 3322488817984}, 'timestamp': '2025-09-15 03:21:49.204668', 'step': 2747, 'epoch': 3} {'type': 'loss', 'content': 0.014508053660392761, 'timestamp': '2025-09-15 03:21:49.228490', 'step': 2748, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:49.259521', 'step': 2748, 'epoch': 3} {'type': 'loss', 'content': 0.002385776722803712, 'timestamp': '2025-09-15 03:21:49.261831', 'step': 2749, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 128], 'flops': 3797092544000}, 'timestamp': '2025-09-15 03:21:49.292328', 'step': 2749, 'epoch': 3} {'type': 'loss', 'content': 0.04963759332895279, 'timestamp': '2025-09-15 03:21:49.294679', 'step': 2750, 'epoch': 3} {'type': 'flops', 'content': {'type': 'train', 'batch_dim': [4, 144], 'flops': 4271696270016}, 'timestamp': '2025-09-15 03:21:49.326062', 'step': 2750, 'epoch': 3} {'type': 'loss', 'content': 0.0025882285553961992, 'timestamp': '2025-09-15 03:21:49.328510', 'step': 2751, 'epoch': 3} {'type': 'flops', 'content': [{'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 112], 'batch_size': 8, 'flops': 2214805229440}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 80], 'batch_size': 8, 'flops': 1582003754624}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}, {'type': 'perplexity', 'in_batch_dim': [8, 96], 'batch_size': 8, 'flops': 1898404492032}], 'timestamp': '2025-09-15 03:21:50.073511', 'step': 2751, 'epoch': 3} {'type': 'pplx', 'content': 65059509.345948905, 'timestamp': '2025-09-15 03:21:50.075790', 'step': 2751, 'epoch': 3} {'type': 'best_pplx', 'content': 43281895.896381795, 'timestamp': '2025-09-15 03:21:50.077423', 'step': 2751, 'epoch': 3} {'type': 'best_step', 'content': 2508, 'timestamp': '2025-09-15 03:21:50.078993', 'step': 2751, 'epoch': 3} {'type': 'total_pplx_flops', 'content': 5014951860256000, 'timestamp': '2025-09-15 03:21:50.080603', 'step': 2751, 'epoch': 3} {'type': 'total_train_flops', 'content': 10640863719936576, 'timestamp': '2025-09-15 03:21:50.082991', 'step': 2751, 'epoch': 3}