Uploaded checkpoint-27500
Browse files- adapter_model.safetensors +1 -1
- optimizer.pt +1 -1
- rng_state.pth +1 -1
- scheduler.pt +1 -1
- trainer_state.json +1761 -3
adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 119975656
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8ad86e91f2924a189dc19b796deef58a4c1f44b9596040dfbed596e3a58a58a4
|
| 3 |
size 119975656
|
optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 240145026
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e6dcec2fc7bddf9ccd947d61cfbb7ec0d09e233cfdb81b4d8f00e3043b60ec27
|
| 3 |
size 240145026
|
rng_state.pth
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 14244
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:971f8c6b700d32d9d1711207ade77f4dca9cda1be000e561bca9b74000ac50f5
|
| 3 |
size 14244
|
scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1064
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bae572518ab53ddc674f52a5ef01613875bea64a8d9c53d4b7d4a9aedc712f19
|
| 3 |
size 1064
|
trainer_state.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
"best_metric": 1.2961933612823486,
|
| 3 |
"best_model_checkpoint": "runs/deepseek_lora_20240422-141601/checkpoint-25000",
|
| 4 |
-
"epoch": 0.
|
| 5 |
"eval_steps": 2500,
|
| 6 |
-
"global_step":
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
@@ -17587,6 +17587,1764 @@
|
|
| 17587 |
"eval_samples_per_second": 8.174,
|
| 17588 |
"eval_steps_per_second": 8.174,
|
| 17589 |
"step": 25000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17590 |
}
|
| 17591 |
],
|
| 17592 |
"logging_steps": 10,
|
|
@@ -17594,7 +19352,7 @@
|
|
| 17594 |
"num_input_tokens_seen": 0,
|
| 17595 |
"num_train_epochs": 1,
|
| 17596 |
"save_steps": 2500,
|
| 17597 |
-
"total_flos": 4.
|
| 17598 |
"train_batch_size": 1,
|
| 17599 |
"trial_name": null,
|
| 17600 |
"trial_params": null
|
|
|
|
| 1 |
{
|
| 2 |
"best_metric": 1.2961933612823486,
|
| 3 |
"best_model_checkpoint": "runs/deepseek_lora_20240422-141601/checkpoint-25000",
|
| 4 |
+
"epoch": 0.6875,
|
| 5 |
"eval_steps": 2500,
|
| 6 |
+
"global_step": 27500,
|
| 7 |
"is_hyper_param_search": false,
|
| 8 |
"is_local_process_zero": true,
|
| 9 |
"is_world_process_zero": true,
|
|
|
|
| 17587 |
"eval_samples_per_second": 8.174,
|
| 17588 |
"eval_steps_per_second": 8.174,
|
| 17589 |
"step": 25000
|
| 17590 |
+
},
|
| 17591 |
+
{
|
| 17592 |
+
"epoch": 0.63,
|
| 17593 |
+
"grad_norm": 7.878478527069092,
|
| 17594 |
+
"learning_rate": 3.3830508474576273e-06,
|
| 17595 |
+
"loss": 1.5645,
|
| 17596 |
+
"step": 25010
|
| 17597 |
+
},
|
| 17598 |
+
{
|
| 17599 |
+
"epoch": 0.63,
|
| 17600 |
+
"grad_norm": 4.464993000030518,
|
| 17601 |
+
"learning_rate": 3.3762711864406783e-06,
|
| 17602 |
+
"loss": 1.2522,
|
| 17603 |
+
"step": 25020
|
| 17604 |
+
},
|
| 17605 |
+
{
|
| 17606 |
+
"epoch": 0.63,
|
| 17607 |
+
"grad_norm": 1.824704885482788,
|
| 17608 |
+
"learning_rate": 3.3694915254237292e-06,
|
| 17609 |
+
"loss": 1.1667,
|
| 17610 |
+
"step": 25030
|
| 17611 |
+
},
|
| 17612 |
+
{
|
| 17613 |
+
"epoch": 0.63,
|
| 17614 |
+
"grad_norm": 4.979220390319824,
|
| 17615 |
+
"learning_rate": 3.3627118644067802e-06,
|
| 17616 |
+
"loss": 1.4592,
|
| 17617 |
+
"step": 25040
|
| 17618 |
+
},
|
| 17619 |
+
{
|
| 17620 |
+
"epoch": 0.63,
|
| 17621 |
+
"grad_norm": 7.964636325836182,
|
| 17622 |
+
"learning_rate": 3.3559322033898308e-06,
|
| 17623 |
+
"loss": 1.3505,
|
| 17624 |
+
"step": 25050
|
| 17625 |
+
},
|
| 17626 |
+
{
|
| 17627 |
+
"epoch": 0.63,
|
| 17628 |
+
"grad_norm": 3.848740816116333,
|
| 17629 |
+
"learning_rate": 3.3491525423728817e-06,
|
| 17630 |
+
"loss": 1.3424,
|
| 17631 |
+
"step": 25060
|
| 17632 |
+
},
|
| 17633 |
+
{
|
| 17634 |
+
"epoch": 0.63,
|
| 17635 |
+
"grad_norm": 5.446021556854248,
|
| 17636 |
+
"learning_rate": 3.3423728813559327e-06,
|
| 17637 |
+
"loss": 1.3514,
|
| 17638 |
+
"step": 25070
|
| 17639 |
+
},
|
| 17640 |
+
{
|
| 17641 |
+
"epoch": 0.63,
|
| 17642 |
+
"grad_norm": 4.195797920227051,
|
| 17643 |
+
"learning_rate": 3.3355932203389833e-06,
|
| 17644 |
+
"loss": 1.4442,
|
| 17645 |
+
"step": 25080
|
| 17646 |
+
},
|
| 17647 |
+
{
|
| 17648 |
+
"epoch": 0.63,
|
| 17649 |
+
"grad_norm": 5.259161949157715,
|
| 17650 |
+
"learning_rate": 3.3288135593220343e-06,
|
| 17651 |
+
"loss": 1.4244,
|
| 17652 |
+
"step": 25090
|
| 17653 |
+
},
|
| 17654 |
+
{
|
| 17655 |
+
"epoch": 0.63,
|
| 17656 |
+
"grad_norm": 4.59972620010376,
|
| 17657 |
+
"learning_rate": 3.322033898305085e-06,
|
| 17658 |
+
"loss": 1.3104,
|
| 17659 |
+
"step": 25100
|
| 17660 |
+
},
|
| 17661 |
+
{
|
| 17662 |
+
"epoch": 0.63,
|
| 17663 |
+
"grad_norm": 4.761903762817383,
|
| 17664 |
+
"learning_rate": 3.3152542372881358e-06,
|
| 17665 |
+
"loss": 1.3031,
|
| 17666 |
+
"step": 25110
|
| 17667 |
+
},
|
| 17668 |
+
{
|
| 17669 |
+
"epoch": 0.63,
|
| 17670 |
+
"grad_norm": 10.678994178771973,
|
| 17671 |
+
"learning_rate": 3.3084745762711868e-06,
|
| 17672 |
+
"loss": 1.372,
|
| 17673 |
+
"step": 25120
|
| 17674 |
+
},
|
| 17675 |
+
{
|
| 17676 |
+
"epoch": 0.63,
|
| 17677 |
+
"grad_norm": 5.033021926879883,
|
| 17678 |
+
"learning_rate": 3.3016949152542377e-06,
|
| 17679 |
+
"loss": 1.3018,
|
| 17680 |
+
"step": 25130
|
| 17681 |
+
},
|
| 17682 |
+
{
|
| 17683 |
+
"epoch": 0.63,
|
| 17684 |
+
"grad_norm": 14.870203971862793,
|
| 17685 |
+
"learning_rate": 3.2949152542372887e-06,
|
| 17686 |
+
"loss": 1.3284,
|
| 17687 |
+
"step": 25140
|
| 17688 |
+
},
|
| 17689 |
+
{
|
| 17690 |
+
"epoch": 0.63,
|
| 17691 |
+
"grad_norm": 1.7636396884918213,
|
| 17692 |
+
"learning_rate": 3.288135593220339e-06,
|
| 17693 |
+
"loss": 1.3044,
|
| 17694 |
+
"step": 25150
|
| 17695 |
+
},
|
| 17696 |
+
{
|
| 17697 |
+
"epoch": 0.63,
|
| 17698 |
+
"grad_norm": 2.6163928508758545,
|
| 17699 |
+
"learning_rate": 3.28135593220339e-06,
|
| 17700 |
+
"loss": 1.1875,
|
| 17701 |
+
"step": 25160
|
| 17702 |
+
},
|
| 17703 |
+
{
|
| 17704 |
+
"epoch": 0.63,
|
| 17705 |
+
"grad_norm": 4.163614273071289,
|
| 17706 |
+
"learning_rate": 3.274576271186441e-06,
|
| 17707 |
+
"loss": 1.3458,
|
| 17708 |
+
"step": 25170
|
| 17709 |
+
},
|
| 17710 |
+
{
|
| 17711 |
+
"epoch": 0.63,
|
| 17712 |
+
"grad_norm": 9.529121398925781,
|
| 17713 |
+
"learning_rate": 3.2677966101694918e-06,
|
| 17714 |
+
"loss": 1.2659,
|
| 17715 |
+
"step": 25180
|
| 17716 |
+
},
|
| 17717 |
+
{
|
| 17718 |
+
"epoch": 0.63,
|
| 17719 |
+
"grad_norm": 10.017566680908203,
|
| 17720 |
+
"learning_rate": 3.2610169491525428e-06,
|
| 17721 |
+
"loss": 1.4239,
|
| 17722 |
+
"step": 25190
|
| 17723 |
+
},
|
| 17724 |
+
{
|
| 17725 |
+
"epoch": 0.63,
|
| 17726 |
+
"grad_norm": 4.03933048248291,
|
| 17727 |
+
"learning_rate": 3.2542372881355933e-06,
|
| 17728 |
+
"loss": 1.252,
|
| 17729 |
+
"step": 25200
|
| 17730 |
+
},
|
| 17731 |
+
{
|
| 17732 |
+
"epoch": 0.63,
|
| 17733 |
+
"grad_norm": 6.725574970245361,
|
| 17734 |
+
"learning_rate": 3.2474576271186443e-06,
|
| 17735 |
+
"loss": 1.1699,
|
| 17736 |
+
"step": 25210
|
| 17737 |
+
},
|
| 17738 |
+
{
|
| 17739 |
+
"epoch": 0.63,
|
| 17740 |
+
"grad_norm": 8.446992874145508,
|
| 17741 |
+
"learning_rate": 3.2406779661016953e-06,
|
| 17742 |
+
"loss": 1.2996,
|
| 17743 |
+
"step": 25220
|
| 17744 |
+
},
|
| 17745 |
+
{
|
| 17746 |
+
"epoch": 0.63,
|
| 17747 |
+
"grad_norm": 15.130813598632812,
|
| 17748 |
+
"learning_rate": 3.2338983050847462e-06,
|
| 17749 |
+
"loss": 1.1601,
|
| 17750 |
+
"step": 25230
|
| 17751 |
+
},
|
| 17752 |
+
{
|
| 17753 |
+
"epoch": 0.63,
|
| 17754 |
+
"grad_norm": 8.915867805480957,
|
| 17755 |
+
"learning_rate": 3.2271186440677972e-06,
|
| 17756 |
+
"loss": 1.4512,
|
| 17757 |
+
"step": 25240
|
| 17758 |
+
},
|
| 17759 |
+
{
|
| 17760 |
+
"epoch": 0.63,
|
| 17761 |
+
"grad_norm": 3.813676595687866,
|
| 17762 |
+
"learning_rate": 3.2203389830508473e-06,
|
| 17763 |
+
"loss": 1.4311,
|
| 17764 |
+
"step": 25250
|
| 17765 |
+
},
|
| 17766 |
+
{
|
| 17767 |
+
"epoch": 0.63,
|
| 17768 |
+
"grad_norm": 8.306417465209961,
|
| 17769 |
+
"learning_rate": 3.2135593220338983e-06,
|
| 17770 |
+
"loss": 1.2554,
|
| 17771 |
+
"step": 25260
|
| 17772 |
+
},
|
| 17773 |
+
{
|
| 17774 |
+
"epoch": 0.63,
|
| 17775 |
+
"grad_norm": 4.557586193084717,
|
| 17776 |
+
"learning_rate": 3.2067796610169493e-06,
|
| 17777 |
+
"loss": 1.4084,
|
| 17778 |
+
"step": 25270
|
| 17779 |
+
},
|
| 17780 |
+
{
|
| 17781 |
+
"epoch": 0.63,
|
| 17782 |
+
"grad_norm": 3.206493616104126,
|
| 17783 |
+
"learning_rate": 3.2000000000000003e-06,
|
| 17784 |
+
"loss": 1.3265,
|
| 17785 |
+
"step": 25280
|
| 17786 |
+
},
|
| 17787 |
+
{
|
| 17788 |
+
"epoch": 0.63,
|
| 17789 |
+
"grad_norm": 4.9644622802734375,
|
| 17790 |
+
"learning_rate": 3.1932203389830513e-06,
|
| 17791 |
+
"loss": 1.3932,
|
| 17792 |
+
"step": 25290
|
| 17793 |
+
},
|
| 17794 |
+
{
|
| 17795 |
+
"epoch": 0.63,
|
| 17796 |
+
"grad_norm": 3.4366214275360107,
|
| 17797 |
+
"learning_rate": 3.186440677966102e-06,
|
| 17798 |
+
"loss": 1.4233,
|
| 17799 |
+
"step": 25300
|
| 17800 |
+
},
|
| 17801 |
+
{
|
| 17802 |
+
"epoch": 0.63,
|
| 17803 |
+
"grad_norm": 6.7854461669921875,
|
| 17804 |
+
"learning_rate": 3.1796610169491528e-06,
|
| 17805 |
+
"loss": 1.2838,
|
| 17806 |
+
"step": 25310
|
| 17807 |
+
},
|
| 17808 |
+
{
|
| 17809 |
+
"epoch": 0.63,
|
| 17810 |
+
"grad_norm": 6.945521831512451,
|
| 17811 |
+
"learning_rate": 3.1728813559322038e-06,
|
| 17812 |
+
"loss": 1.2936,
|
| 17813 |
+
"step": 25320
|
| 17814 |
+
},
|
| 17815 |
+
{
|
| 17816 |
+
"epoch": 0.63,
|
| 17817 |
+
"grad_norm": 4.550187587738037,
|
| 17818 |
+
"learning_rate": 3.1661016949152547e-06,
|
| 17819 |
+
"loss": 1.3477,
|
| 17820 |
+
"step": 25330
|
| 17821 |
+
},
|
| 17822 |
+
{
|
| 17823 |
+
"epoch": 0.63,
|
| 17824 |
+
"grad_norm": 5.256103515625,
|
| 17825 |
+
"learning_rate": 3.1593220338983053e-06,
|
| 17826 |
+
"loss": 1.4062,
|
| 17827 |
+
"step": 25340
|
| 17828 |
+
},
|
| 17829 |
+
{
|
| 17830 |
+
"epoch": 0.63,
|
| 17831 |
+
"grad_norm": 11.691208839416504,
|
| 17832 |
+
"learning_rate": 3.1525423728813563e-06,
|
| 17833 |
+
"loss": 1.1473,
|
| 17834 |
+
"step": 25350
|
| 17835 |
+
},
|
| 17836 |
+
{
|
| 17837 |
+
"epoch": 0.63,
|
| 17838 |
+
"grad_norm": 5.051747798919678,
|
| 17839 |
+
"learning_rate": 3.145762711864407e-06,
|
| 17840 |
+
"loss": 1.4299,
|
| 17841 |
+
"step": 25360
|
| 17842 |
+
},
|
| 17843 |
+
{
|
| 17844 |
+
"epoch": 0.63,
|
| 17845 |
+
"grad_norm": 2.562920093536377,
|
| 17846 |
+
"learning_rate": 3.138983050847458e-06,
|
| 17847 |
+
"loss": 1.2428,
|
| 17848 |
+
"step": 25370
|
| 17849 |
+
},
|
| 17850 |
+
{
|
| 17851 |
+
"epoch": 0.63,
|
| 17852 |
+
"grad_norm": 7.520709037780762,
|
| 17853 |
+
"learning_rate": 3.1322033898305088e-06,
|
| 17854 |
+
"loss": 1.1736,
|
| 17855 |
+
"step": 25380
|
| 17856 |
+
},
|
| 17857 |
+
{
|
| 17858 |
+
"epoch": 0.63,
|
| 17859 |
+
"grad_norm": 5.788994789123535,
|
| 17860 |
+
"learning_rate": 3.1254237288135598e-06,
|
| 17861 |
+
"loss": 1.3141,
|
| 17862 |
+
"step": 25390
|
| 17863 |
+
},
|
| 17864 |
+
{
|
| 17865 |
+
"epoch": 0.64,
|
| 17866 |
+
"grad_norm": 17.664766311645508,
|
| 17867 |
+
"learning_rate": 3.1186440677966107e-06,
|
| 17868 |
+
"loss": 1.357,
|
| 17869 |
+
"step": 25400
|
| 17870 |
+
},
|
| 17871 |
+
{
|
| 17872 |
+
"epoch": 0.64,
|
| 17873 |
+
"grad_norm": 4.366672992706299,
|
| 17874 |
+
"learning_rate": 3.111864406779661e-06,
|
| 17875 |
+
"loss": 1.3631,
|
| 17876 |
+
"step": 25410
|
| 17877 |
+
},
|
| 17878 |
+
{
|
| 17879 |
+
"epoch": 0.64,
|
| 17880 |
+
"grad_norm": 15.109098434448242,
|
| 17881 |
+
"learning_rate": 3.105084745762712e-06,
|
| 17882 |
+
"loss": 1.5056,
|
| 17883 |
+
"step": 25420
|
| 17884 |
+
},
|
| 17885 |
+
{
|
| 17886 |
+
"epoch": 0.64,
|
| 17887 |
+
"grad_norm": 12.550411224365234,
|
| 17888 |
+
"learning_rate": 3.098305084745763e-06,
|
| 17889 |
+
"loss": 1.2255,
|
| 17890 |
+
"step": 25430
|
| 17891 |
+
},
|
| 17892 |
+
{
|
| 17893 |
+
"epoch": 0.64,
|
| 17894 |
+
"grad_norm": 5.603880882263184,
|
| 17895 |
+
"learning_rate": 3.091525423728814e-06,
|
| 17896 |
+
"loss": 1.4206,
|
| 17897 |
+
"step": 25440
|
| 17898 |
+
},
|
| 17899 |
+
{
|
| 17900 |
+
"epoch": 0.64,
|
| 17901 |
+
"grad_norm": 5.365425109863281,
|
| 17902 |
+
"learning_rate": 3.0847457627118648e-06,
|
| 17903 |
+
"loss": 1.1442,
|
| 17904 |
+
"step": 25450
|
| 17905 |
+
},
|
| 17906 |
+
{
|
| 17907 |
+
"epoch": 0.64,
|
| 17908 |
+
"grad_norm": 6.900291442871094,
|
| 17909 |
+
"learning_rate": 3.0779661016949153e-06,
|
| 17910 |
+
"loss": 1.3042,
|
| 17911 |
+
"step": 25460
|
| 17912 |
+
},
|
| 17913 |
+
{
|
| 17914 |
+
"epoch": 0.64,
|
| 17915 |
+
"grad_norm": 6.29402494430542,
|
| 17916 |
+
"learning_rate": 3.0711864406779663e-06,
|
| 17917 |
+
"loss": 1.2615,
|
| 17918 |
+
"step": 25470
|
| 17919 |
+
},
|
| 17920 |
+
{
|
| 17921 |
+
"epoch": 0.64,
|
| 17922 |
+
"grad_norm": 11.43036937713623,
|
| 17923 |
+
"learning_rate": 3.0644067796610173e-06,
|
| 17924 |
+
"loss": 1.1831,
|
| 17925 |
+
"step": 25480
|
| 17926 |
+
},
|
| 17927 |
+
{
|
| 17928 |
+
"epoch": 0.64,
|
| 17929 |
+
"grad_norm": 8.487439155578613,
|
| 17930 |
+
"learning_rate": 3.0576271186440683e-06,
|
| 17931 |
+
"loss": 1.2347,
|
| 17932 |
+
"step": 25490
|
| 17933 |
+
},
|
| 17934 |
+
{
|
| 17935 |
+
"epoch": 0.64,
|
| 17936 |
+
"grad_norm": 4.873865127563477,
|
| 17937 |
+
"learning_rate": 3.0508474576271192e-06,
|
| 17938 |
+
"loss": 1.0908,
|
| 17939 |
+
"step": 25500
|
| 17940 |
+
},
|
| 17941 |
+
{
|
| 17942 |
+
"epoch": 0.64,
|
| 17943 |
+
"grad_norm": 2.734248161315918,
|
| 17944 |
+
"learning_rate": 3.0440677966101694e-06,
|
| 17945 |
+
"loss": 1.0498,
|
| 17946 |
+
"step": 25510
|
| 17947 |
+
},
|
| 17948 |
+
{
|
| 17949 |
+
"epoch": 0.64,
|
| 17950 |
+
"grad_norm": 12.489617347717285,
|
| 17951 |
+
"learning_rate": 3.0372881355932203e-06,
|
| 17952 |
+
"loss": 1.2103,
|
| 17953 |
+
"step": 25520
|
| 17954 |
+
},
|
| 17955 |
+
{
|
| 17956 |
+
"epoch": 0.64,
|
| 17957 |
+
"grad_norm": 11.316421508789062,
|
| 17958 |
+
"learning_rate": 3.0305084745762713e-06,
|
| 17959 |
+
"loss": 1.5457,
|
| 17960 |
+
"step": 25530
|
| 17961 |
+
},
|
| 17962 |
+
{
|
| 17963 |
+
"epoch": 0.64,
|
| 17964 |
+
"grad_norm": 7.098801612854004,
|
| 17965 |
+
"learning_rate": 3.0237288135593223e-06,
|
| 17966 |
+
"loss": 1.4746,
|
| 17967 |
+
"step": 25540
|
| 17968 |
+
},
|
| 17969 |
+
{
|
| 17970 |
+
"epoch": 0.64,
|
| 17971 |
+
"grad_norm": 5.496173858642578,
|
| 17972 |
+
"learning_rate": 3.0169491525423733e-06,
|
| 17973 |
+
"loss": 1.296,
|
| 17974 |
+
"step": 25550
|
| 17975 |
+
},
|
| 17976 |
+
{
|
| 17977 |
+
"epoch": 0.64,
|
| 17978 |
+
"grad_norm": 3.706704616546631,
|
| 17979 |
+
"learning_rate": 3.010169491525424e-06,
|
| 17980 |
+
"loss": 1.4458,
|
| 17981 |
+
"step": 25560
|
| 17982 |
+
},
|
| 17983 |
+
{
|
| 17984 |
+
"epoch": 0.64,
|
| 17985 |
+
"grad_norm": 10.640968322753906,
|
| 17986 |
+
"learning_rate": 3.003389830508475e-06,
|
| 17987 |
+
"loss": 1.2303,
|
| 17988 |
+
"step": 25570
|
| 17989 |
+
},
|
| 17990 |
+
{
|
| 17991 |
+
"epoch": 0.64,
|
| 17992 |
+
"grad_norm": 9.76960563659668,
|
| 17993 |
+
"learning_rate": 2.9966101694915258e-06,
|
| 17994 |
+
"loss": 1.3352,
|
| 17995 |
+
"step": 25580
|
| 17996 |
+
},
|
| 17997 |
+
{
|
| 17998 |
+
"epoch": 0.64,
|
| 17999 |
+
"grad_norm": 6.274062633514404,
|
| 18000 |
+
"learning_rate": 2.9898305084745768e-06,
|
| 18001 |
+
"loss": 1.4795,
|
| 18002 |
+
"step": 25590
|
| 18003 |
+
},
|
| 18004 |
+
{
|
| 18005 |
+
"epoch": 0.64,
|
| 18006 |
+
"grad_norm": 2.7021098136901855,
|
| 18007 |
+
"learning_rate": 2.9830508474576277e-06,
|
| 18008 |
+
"loss": 1.3396,
|
| 18009 |
+
"step": 25600
|
| 18010 |
+
},
|
| 18011 |
+
{
|
| 18012 |
+
"epoch": 0.64,
|
| 18013 |
+
"grad_norm": 4.872988224029541,
|
| 18014 |
+
"learning_rate": 2.9762711864406783e-06,
|
| 18015 |
+
"loss": 1.4767,
|
| 18016 |
+
"step": 25610
|
| 18017 |
+
},
|
| 18018 |
+
{
|
| 18019 |
+
"epoch": 0.64,
|
| 18020 |
+
"grad_norm": 8.984478950500488,
|
| 18021 |
+
"learning_rate": 2.969491525423729e-06,
|
| 18022 |
+
"loss": 1.3939,
|
| 18023 |
+
"step": 25620
|
| 18024 |
+
},
|
| 18025 |
+
{
|
| 18026 |
+
"epoch": 0.64,
|
| 18027 |
+
"grad_norm": 18.204336166381836,
|
| 18028 |
+
"learning_rate": 2.96271186440678e-06,
|
| 18029 |
+
"loss": 1.3875,
|
| 18030 |
+
"step": 25630
|
| 18031 |
+
},
|
| 18032 |
+
{
|
| 18033 |
+
"epoch": 0.64,
|
| 18034 |
+
"grad_norm": 6.397688388824463,
|
| 18035 |
+
"learning_rate": 2.955932203389831e-06,
|
| 18036 |
+
"loss": 1.1992,
|
| 18037 |
+
"step": 25640
|
| 18038 |
+
},
|
| 18039 |
+
{
|
| 18040 |
+
"epoch": 0.64,
|
| 18041 |
+
"grad_norm": 3.8524389266967773,
|
| 18042 |
+
"learning_rate": 2.9491525423728818e-06,
|
| 18043 |
+
"loss": 1.5454,
|
| 18044 |
+
"step": 25650
|
| 18045 |
+
},
|
| 18046 |
+
{
|
| 18047 |
+
"epoch": 0.64,
|
| 18048 |
+
"grad_norm": 6.4902191162109375,
|
| 18049 |
+
"learning_rate": 2.9423728813559327e-06,
|
| 18050 |
+
"loss": 1.3023,
|
| 18051 |
+
"step": 25660
|
| 18052 |
+
},
|
| 18053 |
+
{
|
| 18054 |
+
"epoch": 0.64,
|
| 18055 |
+
"grad_norm": 2.5433766841888428,
|
| 18056 |
+
"learning_rate": 2.935593220338983e-06,
|
| 18057 |
+
"loss": 1.3424,
|
| 18058 |
+
"step": 25670
|
| 18059 |
+
},
|
| 18060 |
+
{
|
| 18061 |
+
"epoch": 0.64,
|
| 18062 |
+
"grad_norm": 2.7309176921844482,
|
| 18063 |
+
"learning_rate": 2.928813559322034e-06,
|
| 18064 |
+
"loss": 1.2601,
|
| 18065 |
+
"step": 25680
|
| 18066 |
+
},
|
| 18067 |
+
{
|
| 18068 |
+
"epoch": 0.64,
|
| 18069 |
+
"grad_norm": 5.759544849395752,
|
| 18070 |
+
"learning_rate": 2.922033898305085e-06,
|
| 18071 |
+
"loss": 1.4213,
|
| 18072 |
+
"step": 25690
|
| 18073 |
+
},
|
| 18074 |
+
{
|
| 18075 |
+
"epoch": 0.64,
|
| 18076 |
+
"grad_norm": 8.862116813659668,
|
| 18077 |
+
"learning_rate": 2.915254237288136e-06,
|
| 18078 |
+
"loss": 1.2039,
|
| 18079 |
+
"step": 25700
|
| 18080 |
+
},
|
| 18081 |
+
{
|
| 18082 |
+
"epoch": 0.64,
|
| 18083 |
+
"grad_norm": 4.360088348388672,
|
| 18084 |
+
"learning_rate": 2.9084745762711868e-06,
|
| 18085 |
+
"loss": 1.2741,
|
| 18086 |
+
"step": 25710
|
| 18087 |
+
},
|
| 18088 |
+
{
|
| 18089 |
+
"epoch": 0.64,
|
| 18090 |
+
"grad_norm": 6.417861461639404,
|
| 18091 |
+
"learning_rate": 2.9016949152542373e-06,
|
| 18092 |
+
"loss": 1.173,
|
| 18093 |
+
"step": 25720
|
| 18094 |
+
},
|
| 18095 |
+
{
|
| 18096 |
+
"epoch": 0.64,
|
| 18097 |
+
"grad_norm": 4.8081254959106445,
|
| 18098 |
+
"learning_rate": 2.8949152542372883e-06,
|
| 18099 |
+
"loss": 1.3977,
|
| 18100 |
+
"step": 25730
|
| 18101 |
+
},
|
| 18102 |
+
{
|
| 18103 |
+
"epoch": 0.64,
|
| 18104 |
+
"grad_norm": 4.581600189208984,
|
| 18105 |
+
"learning_rate": 2.8881355932203393e-06,
|
| 18106 |
+
"loss": 1.2773,
|
| 18107 |
+
"step": 25740
|
| 18108 |
+
},
|
| 18109 |
+
{
|
| 18110 |
+
"epoch": 0.64,
|
| 18111 |
+
"grad_norm": 7.737278938293457,
|
| 18112 |
+
"learning_rate": 2.8813559322033903e-06,
|
| 18113 |
+
"loss": 1.3239,
|
| 18114 |
+
"step": 25750
|
| 18115 |
+
},
|
| 18116 |
+
{
|
| 18117 |
+
"epoch": 0.64,
|
| 18118 |
+
"grad_norm": 8.583956718444824,
|
| 18119 |
+
"learning_rate": 2.8745762711864412e-06,
|
| 18120 |
+
"loss": 1.3715,
|
| 18121 |
+
"step": 25760
|
| 18122 |
+
},
|
| 18123 |
+
{
|
| 18124 |
+
"epoch": 0.64,
|
| 18125 |
+
"grad_norm": 6.567660331726074,
|
| 18126 |
+
"learning_rate": 2.8677966101694914e-06,
|
| 18127 |
+
"loss": 1.0883,
|
| 18128 |
+
"step": 25770
|
| 18129 |
+
},
|
| 18130 |
+
{
|
| 18131 |
+
"epoch": 0.64,
|
| 18132 |
+
"grad_norm": 8.99410343170166,
|
| 18133 |
+
"learning_rate": 2.8610169491525424e-06,
|
| 18134 |
+
"loss": 1.213,
|
| 18135 |
+
"step": 25780
|
| 18136 |
+
},
|
| 18137 |
+
{
|
| 18138 |
+
"epoch": 0.64,
|
| 18139 |
+
"grad_norm": 7.688558101654053,
|
| 18140 |
+
"learning_rate": 2.8542372881355933e-06,
|
| 18141 |
+
"loss": 1.147,
|
| 18142 |
+
"step": 25790
|
| 18143 |
+
},
|
| 18144 |
+
{
|
| 18145 |
+
"epoch": 0.65,
|
| 18146 |
+
"grad_norm": 9.962355613708496,
|
| 18147 |
+
"learning_rate": 2.8474576271186443e-06,
|
| 18148 |
+
"loss": 1.243,
|
| 18149 |
+
"step": 25800
|
| 18150 |
+
},
|
| 18151 |
+
{
|
| 18152 |
+
"epoch": 0.65,
|
| 18153 |
+
"grad_norm": 2.2275478839874268,
|
| 18154 |
+
"learning_rate": 2.8406779661016953e-06,
|
| 18155 |
+
"loss": 1.2328,
|
| 18156 |
+
"step": 25810
|
| 18157 |
+
},
|
| 18158 |
+
{
|
| 18159 |
+
"epoch": 0.65,
|
| 18160 |
+
"grad_norm": 3.4127414226531982,
|
| 18161 |
+
"learning_rate": 2.833898305084746e-06,
|
| 18162 |
+
"loss": 1.4244,
|
| 18163 |
+
"step": 25820
|
| 18164 |
+
},
|
| 18165 |
+
{
|
| 18166 |
+
"epoch": 0.65,
|
| 18167 |
+
"grad_norm": 12.15282917022705,
|
| 18168 |
+
"learning_rate": 2.827118644067797e-06,
|
| 18169 |
+
"loss": 1.401,
|
| 18170 |
+
"step": 25830
|
| 18171 |
+
},
|
| 18172 |
+
{
|
| 18173 |
+
"epoch": 0.65,
|
| 18174 |
+
"grad_norm": 8.007610321044922,
|
| 18175 |
+
"learning_rate": 2.820338983050848e-06,
|
| 18176 |
+
"loss": 1.3701,
|
| 18177 |
+
"step": 25840
|
| 18178 |
+
},
|
| 18179 |
+
{
|
| 18180 |
+
"epoch": 0.65,
|
| 18181 |
+
"grad_norm": 9.589988708496094,
|
| 18182 |
+
"learning_rate": 2.8135593220338988e-06,
|
| 18183 |
+
"loss": 1.2886,
|
| 18184 |
+
"step": 25850
|
| 18185 |
+
},
|
| 18186 |
+
{
|
| 18187 |
+
"epoch": 0.65,
|
| 18188 |
+
"grad_norm": 4.063002109527588,
|
| 18189 |
+
"learning_rate": 2.8067796610169497e-06,
|
| 18190 |
+
"loss": 1.3383,
|
| 18191 |
+
"step": 25860
|
| 18192 |
+
},
|
| 18193 |
+
{
|
| 18194 |
+
"epoch": 0.65,
|
| 18195 |
+
"grad_norm": 2.1042330265045166,
|
| 18196 |
+
"learning_rate": 2.8000000000000003e-06,
|
| 18197 |
+
"loss": 1.2753,
|
| 18198 |
+
"step": 25870
|
| 18199 |
+
},
|
| 18200 |
+
{
|
| 18201 |
+
"epoch": 0.65,
|
| 18202 |
+
"grad_norm": 23.256053924560547,
|
| 18203 |
+
"learning_rate": 2.793220338983051e-06,
|
| 18204 |
+
"loss": 1.3617,
|
| 18205 |
+
"step": 25880
|
| 18206 |
+
},
|
| 18207 |
+
{
|
| 18208 |
+
"epoch": 0.65,
|
| 18209 |
+
"grad_norm": 7.6475911140441895,
|
| 18210 |
+
"learning_rate": 2.786440677966102e-06,
|
| 18211 |
+
"loss": 1.2862,
|
| 18212 |
+
"step": 25890
|
| 18213 |
+
},
|
| 18214 |
+
{
|
| 18215 |
+
"epoch": 0.65,
|
| 18216 |
+
"grad_norm": 16.81471824645996,
|
| 18217 |
+
"learning_rate": 2.779661016949153e-06,
|
| 18218 |
+
"loss": 1.5188,
|
| 18219 |
+
"step": 25900
|
| 18220 |
+
},
|
| 18221 |
+
{
|
| 18222 |
+
"epoch": 0.65,
|
| 18223 |
+
"grad_norm": 10.20080852508545,
|
| 18224 |
+
"learning_rate": 2.7728813559322038e-06,
|
| 18225 |
+
"loss": 1.5414,
|
| 18226 |
+
"step": 25910
|
| 18227 |
+
},
|
| 18228 |
+
{
|
| 18229 |
+
"epoch": 0.65,
|
| 18230 |
+
"grad_norm": 9.043648719787598,
|
| 18231 |
+
"learning_rate": 2.7661016949152548e-06,
|
| 18232 |
+
"loss": 1.3299,
|
| 18233 |
+
"step": 25920
|
| 18234 |
+
},
|
| 18235 |
+
{
|
| 18236 |
+
"epoch": 0.65,
|
| 18237 |
+
"grad_norm": 4.330277442932129,
|
| 18238 |
+
"learning_rate": 2.7593220338983053e-06,
|
| 18239 |
+
"loss": 1.1826,
|
| 18240 |
+
"step": 25930
|
| 18241 |
+
},
|
| 18242 |
+
{
|
| 18243 |
+
"epoch": 0.65,
|
| 18244 |
+
"grad_norm": 7.962332725524902,
|
| 18245 |
+
"learning_rate": 2.752542372881356e-06,
|
| 18246 |
+
"loss": 1.3566,
|
| 18247 |
+
"step": 25940
|
| 18248 |
+
},
|
| 18249 |
+
{
|
| 18250 |
+
"epoch": 0.65,
|
| 18251 |
+
"grad_norm": 8.763781547546387,
|
| 18252 |
+
"learning_rate": 2.745762711864407e-06,
|
| 18253 |
+
"loss": 1.3485,
|
| 18254 |
+
"step": 25950
|
| 18255 |
+
},
|
| 18256 |
+
{
|
| 18257 |
+
"epoch": 0.65,
|
| 18258 |
+
"grad_norm": 6.160745620727539,
|
| 18259 |
+
"learning_rate": 2.738983050847458e-06,
|
| 18260 |
+
"loss": 1.303,
|
| 18261 |
+
"step": 25960
|
| 18262 |
+
},
|
| 18263 |
+
{
|
| 18264 |
+
"epoch": 0.65,
|
| 18265 |
+
"grad_norm": 2.6451609134674072,
|
| 18266 |
+
"learning_rate": 2.732203389830509e-06,
|
| 18267 |
+
"loss": 1.4117,
|
| 18268 |
+
"step": 25970
|
| 18269 |
+
},
|
| 18270 |
+
{
|
| 18271 |
+
"epoch": 0.65,
|
| 18272 |
+
"grad_norm": 10.857940673828125,
|
| 18273 |
+
"learning_rate": 2.7254237288135593e-06,
|
| 18274 |
+
"loss": 1.1566,
|
| 18275 |
+
"step": 25980
|
| 18276 |
+
},
|
| 18277 |
+
{
|
| 18278 |
+
"epoch": 0.65,
|
| 18279 |
+
"grad_norm": 5.1549601554870605,
|
| 18280 |
+
"learning_rate": 2.7186440677966103e-06,
|
| 18281 |
+
"loss": 1.2738,
|
| 18282 |
+
"step": 25990
|
| 18283 |
+
},
|
| 18284 |
+
{
|
| 18285 |
+
"epoch": 0.65,
|
| 18286 |
+
"grad_norm": 7.437528610229492,
|
| 18287 |
+
"learning_rate": 2.7118644067796613e-06,
|
| 18288 |
+
"loss": 1.4507,
|
| 18289 |
+
"step": 26000
|
| 18290 |
+
},
|
| 18291 |
+
{
|
| 18292 |
+
"epoch": 0.65,
|
| 18293 |
+
"grad_norm": 6.80765962600708,
|
| 18294 |
+
"learning_rate": 2.7050847457627123e-06,
|
| 18295 |
+
"loss": 1.1497,
|
| 18296 |
+
"step": 26010
|
| 18297 |
+
},
|
| 18298 |
+
{
|
| 18299 |
+
"epoch": 0.65,
|
| 18300 |
+
"grad_norm": 6.173390865325928,
|
| 18301 |
+
"learning_rate": 2.6983050847457633e-06,
|
| 18302 |
+
"loss": 1.3426,
|
| 18303 |
+
"step": 26020
|
| 18304 |
+
},
|
| 18305 |
+
{
|
| 18306 |
+
"epoch": 0.65,
|
| 18307 |
+
"grad_norm": 6.7725911140441895,
|
| 18308 |
+
"learning_rate": 2.6915254237288134e-06,
|
| 18309 |
+
"loss": 1.3951,
|
| 18310 |
+
"step": 26030
|
| 18311 |
+
},
|
| 18312 |
+
{
|
| 18313 |
+
"epoch": 0.65,
|
| 18314 |
+
"grad_norm": 6.6503777503967285,
|
| 18315 |
+
"learning_rate": 2.6847457627118644e-06,
|
| 18316 |
+
"loss": 1.4507,
|
| 18317 |
+
"step": 26040
|
| 18318 |
+
},
|
| 18319 |
+
{
|
| 18320 |
+
"epoch": 0.65,
|
| 18321 |
+
"grad_norm": 5.210537433624268,
|
| 18322 |
+
"learning_rate": 2.6779661016949153e-06,
|
| 18323 |
+
"loss": 1.3876,
|
| 18324 |
+
"step": 26050
|
| 18325 |
+
},
|
| 18326 |
+
{
|
| 18327 |
+
"epoch": 0.65,
|
| 18328 |
+
"grad_norm": 3.615936756134033,
|
| 18329 |
+
"learning_rate": 2.6711864406779663e-06,
|
| 18330 |
+
"loss": 1.4476,
|
| 18331 |
+
"step": 26060
|
| 18332 |
+
},
|
| 18333 |
+
{
|
| 18334 |
+
"epoch": 0.65,
|
| 18335 |
+
"grad_norm": 9.065774917602539,
|
| 18336 |
+
"learning_rate": 2.6644067796610173e-06,
|
| 18337 |
+
"loss": 1.3124,
|
| 18338 |
+
"step": 26070
|
| 18339 |
+
},
|
| 18340 |
+
{
|
| 18341 |
+
"epoch": 0.65,
|
| 18342 |
+
"grad_norm": 13.187819480895996,
|
| 18343 |
+
"learning_rate": 2.657627118644068e-06,
|
| 18344 |
+
"loss": 1.2038,
|
| 18345 |
+
"step": 26080
|
| 18346 |
+
},
|
| 18347 |
+
{
|
| 18348 |
+
"epoch": 0.65,
|
| 18349 |
+
"grad_norm": 13.807534217834473,
|
| 18350 |
+
"learning_rate": 2.650847457627119e-06,
|
| 18351 |
+
"loss": 1.2118,
|
| 18352 |
+
"step": 26090
|
| 18353 |
+
},
|
| 18354 |
+
{
|
| 18355 |
+
"epoch": 0.65,
|
| 18356 |
+
"grad_norm": 16.19401741027832,
|
| 18357 |
+
"learning_rate": 2.64406779661017e-06,
|
| 18358 |
+
"loss": 1.2734,
|
| 18359 |
+
"step": 26100
|
| 18360 |
+
},
|
| 18361 |
+
{
|
| 18362 |
+
"epoch": 0.65,
|
| 18363 |
+
"grad_norm": 7.493824005126953,
|
| 18364 |
+
"learning_rate": 2.6372881355932208e-06,
|
| 18365 |
+
"loss": 1.3692,
|
| 18366 |
+
"step": 26110
|
| 18367 |
+
},
|
| 18368 |
+
{
|
| 18369 |
+
"epoch": 0.65,
|
| 18370 |
+
"grad_norm": 16.29339599609375,
|
| 18371 |
+
"learning_rate": 2.6305084745762718e-06,
|
| 18372 |
+
"loss": 1.3132,
|
| 18373 |
+
"step": 26120
|
| 18374 |
+
},
|
| 18375 |
+
{
|
| 18376 |
+
"epoch": 0.65,
|
| 18377 |
+
"grad_norm": 6.331333160400391,
|
| 18378 |
+
"learning_rate": 2.6237288135593223e-06,
|
| 18379 |
+
"loss": 1.3203,
|
| 18380 |
+
"step": 26130
|
| 18381 |
+
},
|
| 18382 |
+
{
|
| 18383 |
+
"epoch": 0.65,
|
| 18384 |
+
"grad_norm": 6.448307991027832,
|
| 18385 |
+
"learning_rate": 2.616949152542373e-06,
|
| 18386 |
+
"loss": 1.2753,
|
| 18387 |
+
"step": 26140
|
| 18388 |
+
},
|
| 18389 |
+
{
|
| 18390 |
+
"epoch": 0.65,
|
| 18391 |
+
"grad_norm": 6.896134376525879,
|
| 18392 |
+
"learning_rate": 2.610169491525424e-06,
|
| 18393 |
+
"loss": 1.349,
|
| 18394 |
+
"step": 26150
|
| 18395 |
+
},
|
| 18396 |
+
{
|
| 18397 |
+
"epoch": 0.65,
|
| 18398 |
+
"grad_norm": 5.189770698547363,
|
| 18399 |
+
"learning_rate": 2.603389830508475e-06,
|
| 18400 |
+
"loss": 1.289,
|
| 18401 |
+
"step": 26160
|
| 18402 |
+
},
|
| 18403 |
+
{
|
| 18404 |
+
"epoch": 0.65,
|
| 18405 |
+
"grad_norm": 1.7721081972122192,
|
| 18406 |
+
"learning_rate": 2.596610169491526e-06,
|
| 18407 |
+
"loss": 1.4432,
|
| 18408 |
+
"step": 26170
|
| 18409 |
+
},
|
| 18410 |
+
{
|
| 18411 |
+
"epoch": 0.65,
|
| 18412 |
+
"grad_norm": 3.9717156887054443,
|
| 18413 |
+
"learning_rate": 2.5898305084745768e-06,
|
| 18414 |
+
"loss": 1.2982,
|
| 18415 |
+
"step": 26180
|
| 18416 |
+
},
|
| 18417 |
+
{
|
| 18418 |
+
"epoch": 0.65,
|
| 18419 |
+
"grad_norm": 19.084896087646484,
|
| 18420 |
+
"learning_rate": 2.5830508474576273e-06,
|
| 18421 |
+
"loss": 1.3897,
|
| 18422 |
+
"step": 26190
|
| 18423 |
+
},
|
| 18424 |
+
{
|
| 18425 |
+
"epoch": 0.66,
|
| 18426 |
+
"grad_norm": 1.6735248565673828,
|
| 18427 |
+
"learning_rate": 2.576271186440678e-06,
|
| 18428 |
+
"loss": 1.4137,
|
| 18429 |
+
"step": 26200
|
| 18430 |
+
},
|
| 18431 |
+
{
|
| 18432 |
+
"epoch": 0.66,
|
| 18433 |
+
"grad_norm": 8.025382041931152,
|
| 18434 |
+
"learning_rate": 2.569491525423729e-06,
|
| 18435 |
+
"loss": 1.2629,
|
| 18436 |
+
"step": 26210
|
| 18437 |
+
},
|
| 18438 |
+
{
|
| 18439 |
+
"epoch": 0.66,
|
| 18440 |
+
"grad_norm": 4.695014476776123,
|
| 18441 |
+
"learning_rate": 2.56271186440678e-06,
|
| 18442 |
+
"loss": 1.5102,
|
| 18443 |
+
"step": 26220
|
| 18444 |
+
},
|
| 18445 |
+
{
|
| 18446 |
+
"epoch": 0.66,
|
| 18447 |
+
"grad_norm": 4.135346412658691,
|
| 18448 |
+
"learning_rate": 2.555932203389831e-06,
|
| 18449 |
+
"loss": 1.1628,
|
| 18450 |
+
"step": 26230
|
| 18451 |
+
},
|
| 18452 |
+
{
|
| 18453 |
+
"epoch": 0.66,
|
| 18454 |
+
"grad_norm": 6.607401371002197,
|
| 18455 |
+
"learning_rate": 2.5491525423728814e-06,
|
| 18456 |
+
"loss": 1.305,
|
| 18457 |
+
"step": 26240
|
| 18458 |
+
},
|
| 18459 |
+
{
|
| 18460 |
+
"epoch": 0.66,
|
| 18461 |
+
"grad_norm": 17.407390594482422,
|
| 18462 |
+
"learning_rate": 2.5423728813559323e-06,
|
| 18463 |
+
"loss": 1.3841,
|
| 18464 |
+
"step": 26250
|
| 18465 |
+
},
|
| 18466 |
+
{
|
| 18467 |
+
"epoch": 0.66,
|
| 18468 |
+
"grad_norm": 13.363433837890625,
|
| 18469 |
+
"learning_rate": 2.5355932203389833e-06,
|
| 18470 |
+
"loss": 1.4593,
|
| 18471 |
+
"step": 26260
|
| 18472 |
+
},
|
| 18473 |
+
{
|
| 18474 |
+
"epoch": 0.66,
|
| 18475 |
+
"grad_norm": 4.77979040145874,
|
| 18476 |
+
"learning_rate": 2.5288135593220343e-06,
|
| 18477 |
+
"loss": 1.2445,
|
| 18478 |
+
"step": 26270
|
| 18479 |
+
},
|
| 18480 |
+
{
|
| 18481 |
+
"epoch": 0.66,
|
| 18482 |
+
"grad_norm": 10.652926445007324,
|
| 18483 |
+
"learning_rate": 2.5220338983050853e-06,
|
| 18484 |
+
"loss": 1.3092,
|
| 18485 |
+
"step": 26280
|
| 18486 |
+
},
|
| 18487 |
+
{
|
| 18488 |
+
"epoch": 0.66,
|
| 18489 |
+
"grad_norm": 5.278314113616943,
|
| 18490 |
+
"learning_rate": 2.5152542372881354e-06,
|
| 18491 |
+
"loss": 1.447,
|
| 18492 |
+
"step": 26290
|
| 18493 |
+
},
|
| 18494 |
+
{
|
| 18495 |
+
"epoch": 0.66,
|
| 18496 |
+
"grad_norm": 6.439229488372803,
|
| 18497 |
+
"learning_rate": 2.5084745762711864e-06,
|
| 18498 |
+
"loss": 1.3421,
|
| 18499 |
+
"step": 26300
|
| 18500 |
+
},
|
| 18501 |
+
{
|
| 18502 |
+
"epoch": 0.66,
|
| 18503 |
+
"grad_norm": 4.738833904266357,
|
| 18504 |
+
"learning_rate": 2.5016949152542374e-06,
|
| 18505 |
+
"loss": 1.3454,
|
| 18506 |
+
"step": 26310
|
| 18507 |
+
},
|
| 18508 |
+
{
|
| 18509 |
+
"epoch": 0.66,
|
| 18510 |
+
"grad_norm": 4.070488929748535,
|
| 18511 |
+
"learning_rate": 2.4949152542372883e-06,
|
| 18512 |
+
"loss": 1.377,
|
| 18513 |
+
"step": 26320
|
| 18514 |
+
},
|
| 18515 |
+
{
|
| 18516 |
+
"epoch": 0.66,
|
| 18517 |
+
"grad_norm": 1.9005275964736938,
|
| 18518 |
+
"learning_rate": 2.488135593220339e-06,
|
| 18519 |
+
"loss": 1.4501,
|
| 18520 |
+
"step": 26330
|
| 18521 |
+
},
|
| 18522 |
+
{
|
| 18523 |
+
"epoch": 0.66,
|
| 18524 |
+
"grad_norm": 9.970990180969238,
|
| 18525 |
+
"learning_rate": 2.48135593220339e-06,
|
| 18526 |
+
"loss": 1.1534,
|
| 18527 |
+
"step": 26340
|
| 18528 |
+
},
|
| 18529 |
+
{
|
| 18530 |
+
"epoch": 0.66,
|
| 18531 |
+
"grad_norm": 2.7662065029144287,
|
| 18532 |
+
"learning_rate": 2.474576271186441e-06,
|
| 18533 |
+
"loss": 1.4422,
|
| 18534 |
+
"step": 26350
|
| 18535 |
+
},
|
| 18536 |
+
{
|
| 18537 |
+
"epoch": 0.66,
|
| 18538 |
+
"grad_norm": 11.093968391418457,
|
| 18539 |
+
"learning_rate": 2.467796610169492e-06,
|
| 18540 |
+
"loss": 1.198,
|
| 18541 |
+
"step": 26360
|
| 18542 |
+
},
|
| 18543 |
+
{
|
| 18544 |
+
"epoch": 0.66,
|
| 18545 |
+
"grad_norm": 6.7317280769348145,
|
| 18546 |
+
"learning_rate": 2.461016949152543e-06,
|
| 18547 |
+
"loss": 1.244,
|
| 18548 |
+
"step": 26370
|
| 18549 |
+
},
|
| 18550 |
+
{
|
| 18551 |
+
"epoch": 0.66,
|
| 18552 |
+
"grad_norm": 8.76866340637207,
|
| 18553 |
+
"learning_rate": 2.4542372881355933e-06,
|
| 18554 |
+
"loss": 1.3653,
|
| 18555 |
+
"step": 26380
|
| 18556 |
+
},
|
| 18557 |
+
{
|
| 18558 |
+
"epoch": 0.66,
|
| 18559 |
+
"grad_norm": 11.940791130065918,
|
| 18560 |
+
"learning_rate": 2.4474576271186443e-06,
|
| 18561 |
+
"loss": 1.304,
|
| 18562 |
+
"step": 26390
|
| 18563 |
+
},
|
| 18564 |
+
{
|
| 18565 |
+
"epoch": 0.66,
|
| 18566 |
+
"grad_norm": 6.687407970428467,
|
| 18567 |
+
"learning_rate": 2.4406779661016953e-06,
|
| 18568 |
+
"loss": 1.1965,
|
| 18569 |
+
"step": 26400
|
| 18570 |
+
},
|
| 18571 |
+
{
|
| 18572 |
+
"epoch": 0.66,
|
| 18573 |
+
"grad_norm": 5.42927885055542,
|
| 18574 |
+
"learning_rate": 2.433898305084746e-06,
|
| 18575 |
+
"loss": 1.3501,
|
| 18576 |
+
"step": 26410
|
| 18577 |
+
},
|
| 18578 |
+
{
|
| 18579 |
+
"epoch": 0.66,
|
| 18580 |
+
"grad_norm": 2.731924057006836,
|
| 18581 |
+
"learning_rate": 2.427118644067797e-06,
|
| 18582 |
+
"loss": 1.4572,
|
| 18583 |
+
"step": 26420
|
| 18584 |
+
},
|
| 18585 |
+
{
|
| 18586 |
+
"epoch": 0.66,
|
| 18587 |
+
"grad_norm": 5.305939197540283,
|
| 18588 |
+
"learning_rate": 2.4203389830508474e-06,
|
| 18589 |
+
"loss": 1.3428,
|
| 18590 |
+
"step": 26430
|
| 18591 |
+
},
|
| 18592 |
+
{
|
| 18593 |
+
"epoch": 0.66,
|
| 18594 |
+
"grad_norm": 10.32532787322998,
|
| 18595 |
+
"learning_rate": 2.4135593220338984e-06,
|
| 18596 |
+
"loss": 1.4827,
|
| 18597 |
+
"step": 26440
|
| 18598 |
+
},
|
| 18599 |
+
{
|
| 18600 |
+
"epoch": 0.66,
|
| 18601 |
+
"grad_norm": 7.55979585647583,
|
| 18602 |
+
"learning_rate": 2.4067796610169493e-06,
|
| 18603 |
+
"loss": 1.2877,
|
| 18604 |
+
"step": 26450
|
| 18605 |
+
},
|
| 18606 |
+
{
|
| 18607 |
+
"epoch": 0.66,
|
| 18608 |
+
"grad_norm": 9.092228889465332,
|
| 18609 |
+
"learning_rate": 2.4000000000000003e-06,
|
| 18610 |
+
"loss": 1.3516,
|
| 18611 |
+
"step": 26460
|
| 18612 |
+
},
|
| 18613 |
+
{
|
| 18614 |
+
"epoch": 0.66,
|
| 18615 |
+
"grad_norm": 4.732894420623779,
|
| 18616 |
+
"learning_rate": 2.393220338983051e-06,
|
| 18617 |
+
"loss": 1.3218,
|
| 18618 |
+
"step": 26470
|
| 18619 |
+
},
|
| 18620 |
+
{
|
| 18621 |
+
"epoch": 0.66,
|
| 18622 |
+
"grad_norm": 8.649917602539062,
|
| 18623 |
+
"learning_rate": 2.386440677966102e-06,
|
| 18624 |
+
"loss": 1.2708,
|
| 18625 |
+
"step": 26480
|
| 18626 |
+
},
|
| 18627 |
+
{
|
| 18628 |
+
"epoch": 0.66,
|
| 18629 |
+
"grad_norm": 4.569608211517334,
|
| 18630 |
+
"learning_rate": 2.379661016949153e-06,
|
| 18631 |
+
"loss": 1.3422,
|
| 18632 |
+
"step": 26490
|
| 18633 |
+
},
|
| 18634 |
+
{
|
| 18635 |
+
"epoch": 0.66,
|
| 18636 |
+
"grad_norm": 3.702059030532837,
|
| 18637 |
+
"learning_rate": 2.372881355932204e-06,
|
| 18638 |
+
"loss": 1.4754,
|
| 18639 |
+
"step": 26500
|
| 18640 |
+
},
|
| 18641 |
+
{
|
| 18642 |
+
"epoch": 0.66,
|
| 18643 |
+
"grad_norm": 3.9777114391326904,
|
| 18644 |
+
"learning_rate": 2.3661016949152544e-06,
|
| 18645 |
+
"loss": 1.3133,
|
| 18646 |
+
"step": 26510
|
| 18647 |
+
},
|
| 18648 |
+
{
|
| 18649 |
+
"epoch": 0.66,
|
| 18650 |
+
"grad_norm": 9.692605018615723,
|
| 18651 |
+
"learning_rate": 2.3593220338983053e-06,
|
| 18652 |
+
"loss": 1.2866,
|
| 18653 |
+
"step": 26520
|
| 18654 |
+
},
|
| 18655 |
+
{
|
| 18656 |
+
"epoch": 0.66,
|
| 18657 |
+
"grad_norm": 2.1870622634887695,
|
| 18658 |
+
"learning_rate": 2.3525423728813563e-06,
|
| 18659 |
+
"loss": 1.4271,
|
| 18660 |
+
"step": 26530
|
| 18661 |
+
},
|
| 18662 |
+
{
|
| 18663 |
+
"epoch": 0.66,
|
| 18664 |
+
"grad_norm": 6.799996852874756,
|
| 18665 |
+
"learning_rate": 2.345762711864407e-06,
|
| 18666 |
+
"loss": 1.3768,
|
| 18667 |
+
"step": 26540
|
| 18668 |
+
},
|
| 18669 |
+
{
|
| 18670 |
+
"epoch": 0.66,
|
| 18671 |
+
"grad_norm": 2.3258345127105713,
|
| 18672 |
+
"learning_rate": 2.338983050847458e-06,
|
| 18673 |
+
"loss": 1.4023,
|
| 18674 |
+
"step": 26550
|
| 18675 |
+
},
|
| 18676 |
+
{
|
| 18677 |
+
"epoch": 0.66,
|
| 18678 |
+
"grad_norm": 3.950892925262451,
|
| 18679 |
+
"learning_rate": 2.3322033898305084e-06,
|
| 18680 |
+
"loss": 1.2986,
|
| 18681 |
+
"step": 26560
|
| 18682 |
+
},
|
| 18683 |
+
{
|
| 18684 |
+
"epoch": 0.66,
|
| 18685 |
+
"grad_norm": 11.114343643188477,
|
| 18686 |
+
"learning_rate": 2.3254237288135594e-06,
|
| 18687 |
+
"loss": 1.3538,
|
| 18688 |
+
"step": 26570
|
| 18689 |
+
},
|
| 18690 |
+
{
|
| 18691 |
+
"epoch": 0.66,
|
| 18692 |
+
"grad_norm": 8.47208023071289,
|
| 18693 |
+
"learning_rate": 2.3186440677966103e-06,
|
| 18694 |
+
"loss": 1.3557,
|
| 18695 |
+
"step": 26580
|
| 18696 |
+
},
|
| 18697 |
+
{
|
| 18698 |
+
"epoch": 0.66,
|
| 18699 |
+
"grad_norm": 15.794944763183594,
|
| 18700 |
+
"learning_rate": 2.3118644067796613e-06,
|
| 18701 |
+
"loss": 1.3295,
|
| 18702 |
+
"step": 26590
|
| 18703 |
+
},
|
| 18704 |
+
{
|
| 18705 |
+
"epoch": 0.67,
|
| 18706 |
+
"grad_norm": 5.596043586730957,
|
| 18707 |
+
"learning_rate": 2.305084745762712e-06,
|
| 18708 |
+
"loss": 1.2669,
|
| 18709 |
+
"step": 26600
|
| 18710 |
+
},
|
| 18711 |
+
{
|
| 18712 |
+
"epoch": 0.67,
|
| 18713 |
+
"grad_norm": 8.727288246154785,
|
| 18714 |
+
"learning_rate": 2.298305084745763e-06,
|
| 18715 |
+
"loss": 1.339,
|
| 18716 |
+
"step": 26610
|
| 18717 |
+
},
|
| 18718 |
+
{
|
| 18719 |
+
"epoch": 0.67,
|
| 18720 |
+
"grad_norm": 10.842510223388672,
|
| 18721 |
+
"learning_rate": 2.291525423728814e-06,
|
| 18722 |
+
"loss": 1.2154,
|
| 18723 |
+
"step": 26620
|
| 18724 |
+
},
|
| 18725 |
+
{
|
| 18726 |
+
"epoch": 0.67,
|
| 18727 |
+
"grad_norm": 11.826702117919922,
|
| 18728 |
+
"learning_rate": 2.284745762711865e-06,
|
| 18729 |
+
"loss": 1.1599,
|
| 18730 |
+
"step": 26630
|
| 18731 |
+
},
|
| 18732 |
+
{
|
| 18733 |
+
"epoch": 0.67,
|
| 18734 |
+
"grad_norm": 16.47806167602539,
|
| 18735 |
+
"learning_rate": 2.2779661016949154e-06,
|
| 18736 |
+
"loss": 1.3005,
|
| 18737 |
+
"step": 26640
|
| 18738 |
+
},
|
| 18739 |
+
{
|
| 18740 |
+
"epoch": 0.67,
|
| 18741 |
+
"grad_norm": 4.797351837158203,
|
| 18742 |
+
"learning_rate": 2.2711864406779663e-06,
|
| 18743 |
+
"loss": 1.2779,
|
| 18744 |
+
"step": 26650
|
| 18745 |
+
},
|
| 18746 |
+
{
|
| 18747 |
+
"epoch": 0.67,
|
| 18748 |
+
"grad_norm": 6.353465557098389,
|
| 18749 |
+
"learning_rate": 2.2644067796610173e-06,
|
| 18750 |
+
"loss": 1.2419,
|
| 18751 |
+
"step": 26660
|
| 18752 |
+
},
|
| 18753 |
+
{
|
| 18754 |
+
"epoch": 0.67,
|
| 18755 |
+
"grad_norm": 10.895663261413574,
|
| 18756 |
+
"learning_rate": 2.257627118644068e-06,
|
| 18757 |
+
"loss": 1.2078,
|
| 18758 |
+
"step": 26670
|
| 18759 |
+
},
|
| 18760 |
+
{
|
| 18761 |
+
"epoch": 0.67,
|
| 18762 |
+
"grad_norm": 7.583923816680908,
|
| 18763 |
+
"learning_rate": 2.250847457627119e-06,
|
| 18764 |
+
"loss": 1.3381,
|
| 18765 |
+
"step": 26680
|
| 18766 |
+
},
|
| 18767 |
+
{
|
| 18768 |
+
"epoch": 0.67,
|
| 18769 |
+
"grad_norm": 10.84875774383545,
|
| 18770 |
+
"learning_rate": 2.2440677966101694e-06,
|
| 18771 |
+
"loss": 1.4887,
|
| 18772 |
+
"step": 26690
|
| 18773 |
+
},
|
| 18774 |
+
{
|
| 18775 |
+
"epoch": 0.67,
|
| 18776 |
+
"grad_norm": 5.171149253845215,
|
| 18777 |
+
"learning_rate": 2.2372881355932204e-06,
|
| 18778 |
+
"loss": 1.3469,
|
| 18779 |
+
"step": 26700
|
| 18780 |
+
},
|
| 18781 |
+
{
|
| 18782 |
+
"epoch": 0.67,
|
| 18783 |
+
"grad_norm": 6.136636734008789,
|
| 18784 |
+
"learning_rate": 2.2305084745762714e-06,
|
| 18785 |
+
"loss": 1.1804,
|
| 18786 |
+
"step": 26710
|
| 18787 |
+
},
|
| 18788 |
+
{
|
| 18789 |
+
"epoch": 0.67,
|
| 18790 |
+
"grad_norm": 12.95764446258545,
|
| 18791 |
+
"learning_rate": 2.2237288135593223e-06,
|
| 18792 |
+
"loss": 1.2808,
|
| 18793 |
+
"step": 26720
|
| 18794 |
+
},
|
| 18795 |
+
{
|
| 18796 |
+
"epoch": 0.67,
|
| 18797 |
+
"grad_norm": 4.0281453132629395,
|
| 18798 |
+
"learning_rate": 2.216949152542373e-06,
|
| 18799 |
+
"loss": 1.4454,
|
| 18800 |
+
"step": 26730
|
| 18801 |
+
},
|
| 18802 |
+
{
|
| 18803 |
+
"epoch": 0.67,
|
| 18804 |
+
"grad_norm": 5.7566609382629395,
|
| 18805 |
+
"learning_rate": 2.210169491525424e-06,
|
| 18806 |
+
"loss": 1.2968,
|
| 18807 |
+
"step": 26740
|
| 18808 |
+
},
|
| 18809 |
+
{
|
| 18810 |
+
"epoch": 0.67,
|
| 18811 |
+
"grad_norm": 4.710749626159668,
|
| 18812 |
+
"learning_rate": 2.203389830508475e-06,
|
| 18813 |
+
"loss": 1.355,
|
| 18814 |
+
"step": 26750
|
| 18815 |
+
},
|
| 18816 |
+
{
|
| 18817 |
+
"epoch": 0.67,
|
| 18818 |
+
"grad_norm": 3.0553205013275146,
|
| 18819 |
+
"learning_rate": 2.196610169491526e-06,
|
| 18820 |
+
"loss": 1.3319,
|
| 18821 |
+
"step": 26760
|
| 18822 |
+
},
|
| 18823 |
+
{
|
| 18824 |
+
"epoch": 0.67,
|
| 18825 |
+
"grad_norm": 15.849903106689453,
|
| 18826 |
+
"learning_rate": 2.1898305084745764e-06,
|
| 18827 |
+
"loss": 1.3233,
|
| 18828 |
+
"step": 26770
|
| 18829 |
+
},
|
| 18830 |
+
{
|
| 18831 |
+
"epoch": 0.67,
|
| 18832 |
+
"grad_norm": 31.49736785888672,
|
| 18833 |
+
"learning_rate": 2.1830508474576273e-06,
|
| 18834 |
+
"loss": 1.3991,
|
| 18835 |
+
"step": 26780
|
| 18836 |
+
},
|
| 18837 |
+
{
|
| 18838 |
+
"epoch": 0.67,
|
| 18839 |
+
"grad_norm": 11.734864234924316,
|
| 18840 |
+
"learning_rate": 2.1762711864406783e-06,
|
| 18841 |
+
"loss": 1.5305,
|
| 18842 |
+
"step": 26790
|
| 18843 |
+
},
|
| 18844 |
+
{
|
| 18845 |
+
"epoch": 0.67,
|
| 18846 |
+
"grad_norm": 6.124046325683594,
|
| 18847 |
+
"learning_rate": 2.169491525423729e-06,
|
| 18848 |
+
"loss": 1.249,
|
| 18849 |
+
"step": 26800
|
| 18850 |
+
},
|
| 18851 |
+
{
|
| 18852 |
+
"epoch": 0.67,
|
| 18853 |
+
"grad_norm": 11.438417434692383,
|
| 18854 |
+
"learning_rate": 2.16271186440678e-06,
|
| 18855 |
+
"loss": 1.2922,
|
| 18856 |
+
"step": 26810
|
| 18857 |
+
},
|
| 18858 |
+
{
|
| 18859 |
+
"epoch": 0.67,
|
| 18860 |
+
"grad_norm": 12.979373931884766,
|
| 18861 |
+
"learning_rate": 2.1559322033898304e-06,
|
| 18862 |
+
"loss": 1.3554,
|
| 18863 |
+
"step": 26820
|
| 18864 |
+
},
|
| 18865 |
+
{
|
| 18866 |
+
"epoch": 0.67,
|
| 18867 |
+
"grad_norm": 3.8955001831054688,
|
| 18868 |
+
"learning_rate": 2.1491525423728814e-06,
|
| 18869 |
+
"loss": 1.3303,
|
| 18870 |
+
"step": 26830
|
| 18871 |
+
},
|
| 18872 |
+
{
|
| 18873 |
+
"epoch": 0.67,
|
| 18874 |
+
"grad_norm": 9.349483489990234,
|
| 18875 |
+
"learning_rate": 2.1423728813559324e-06,
|
| 18876 |
+
"loss": 1.2548,
|
| 18877 |
+
"step": 26840
|
| 18878 |
+
},
|
| 18879 |
+
{
|
| 18880 |
+
"epoch": 0.67,
|
| 18881 |
+
"grad_norm": 2.8842084407806396,
|
| 18882 |
+
"learning_rate": 2.1355932203389833e-06,
|
| 18883 |
+
"loss": 1.3517,
|
| 18884 |
+
"step": 26850
|
| 18885 |
+
},
|
| 18886 |
+
{
|
| 18887 |
+
"epoch": 0.67,
|
| 18888 |
+
"grad_norm": 3.986353635787964,
|
| 18889 |
+
"learning_rate": 2.128813559322034e-06,
|
| 18890 |
+
"loss": 1.5566,
|
| 18891 |
+
"step": 26860
|
| 18892 |
+
},
|
| 18893 |
+
{
|
| 18894 |
+
"epoch": 0.67,
|
| 18895 |
+
"grad_norm": 14.33786392211914,
|
| 18896 |
+
"learning_rate": 2.122033898305085e-06,
|
| 18897 |
+
"loss": 1.2168,
|
| 18898 |
+
"step": 26870
|
| 18899 |
+
},
|
| 18900 |
+
{
|
| 18901 |
+
"epoch": 0.67,
|
| 18902 |
+
"grad_norm": 4.677867889404297,
|
| 18903 |
+
"learning_rate": 2.115254237288136e-06,
|
| 18904 |
+
"loss": 1.2758,
|
| 18905 |
+
"step": 26880
|
| 18906 |
+
},
|
| 18907 |
+
{
|
| 18908 |
+
"epoch": 0.67,
|
| 18909 |
+
"grad_norm": 3.638185977935791,
|
| 18910 |
+
"learning_rate": 2.108474576271187e-06,
|
| 18911 |
+
"loss": 1.2348,
|
| 18912 |
+
"step": 26890
|
| 18913 |
+
},
|
| 18914 |
+
{
|
| 18915 |
+
"epoch": 0.67,
|
| 18916 |
+
"grad_norm": 2.7823917865753174,
|
| 18917 |
+
"learning_rate": 2.1016949152542374e-06,
|
| 18918 |
+
"loss": 1.4506,
|
| 18919 |
+
"step": 26900
|
| 18920 |
+
},
|
| 18921 |
+
{
|
| 18922 |
+
"epoch": 0.67,
|
| 18923 |
+
"grad_norm": 14.349405288696289,
|
| 18924 |
+
"learning_rate": 2.0949152542372883e-06,
|
| 18925 |
+
"loss": 1.2189,
|
| 18926 |
+
"step": 26910
|
| 18927 |
+
},
|
| 18928 |
+
{
|
| 18929 |
+
"epoch": 0.67,
|
| 18930 |
+
"grad_norm": 5.958116054534912,
|
| 18931 |
+
"learning_rate": 2.0881355932203393e-06,
|
| 18932 |
+
"loss": 1.0776,
|
| 18933 |
+
"step": 26920
|
| 18934 |
+
},
|
| 18935 |
+
{
|
| 18936 |
+
"epoch": 0.67,
|
| 18937 |
+
"grad_norm": 5.689637184143066,
|
| 18938 |
+
"learning_rate": 2.08135593220339e-06,
|
| 18939 |
+
"loss": 1.5885,
|
| 18940 |
+
"step": 26930
|
| 18941 |
+
},
|
| 18942 |
+
{
|
| 18943 |
+
"epoch": 0.67,
|
| 18944 |
+
"grad_norm": 17.451379776000977,
|
| 18945 |
+
"learning_rate": 2.074576271186441e-06,
|
| 18946 |
+
"loss": 1.4116,
|
| 18947 |
+
"step": 26940
|
| 18948 |
+
},
|
| 18949 |
+
{
|
| 18950 |
+
"epoch": 0.67,
|
| 18951 |
+
"grad_norm": 6.859378814697266,
|
| 18952 |
+
"learning_rate": 2.0677966101694914e-06,
|
| 18953 |
+
"loss": 1.1968,
|
| 18954 |
+
"step": 26950
|
| 18955 |
+
},
|
| 18956 |
+
{
|
| 18957 |
+
"epoch": 0.67,
|
| 18958 |
+
"grad_norm": 5.8354082107543945,
|
| 18959 |
+
"learning_rate": 2.0610169491525424e-06,
|
| 18960 |
+
"loss": 1.3615,
|
| 18961 |
+
"step": 26960
|
| 18962 |
+
},
|
| 18963 |
+
{
|
| 18964 |
+
"epoch": 0.67,
|
| 18965 |
+
"grad_norm": 4.631415367126465,
|
| 18966 |
+
"learning_rate": 2.0542372881355934e-06,
|
| 18967 |
+
"loss": 1.3517,
|
| 18968 |
+
"step": 26970
|
| 18969 |
+
},
|
| 18970 |
+
{
|
| 18971 |
+
"epoch": 0.67,
|
| 18972 |
+
"grad_norm": 9.62684154510498,
|
| 18973 |
+
"learning_rate": 2.0474576271186443e-06,
|
| 18974 |
+
"loss": 1.3364,
|
| 18975 |
+
"step": 26980
|
| 18976 |
+
},
|
| 18977 |
+
{
|
| 18978 |
+
"epoch": 0.67,
|
| 18979 |
+
"grad_norm": 4.8851447105407715,
|
| 18980 |
+
"learning_rate": 2.0406779661016953e-06,
|
| 18981 |
+
"loss": 1.2893,
|
| 18982 |
+
"step": 26990
|
| 18983 |
+
},
|
| 18984 |
+
{
|
| 18985 |
+
"epoch": 0.68,
|
| 18986 |
+
"grad_norm": 5.123071670532227,
|
| 18987 |
+
"learning_rate": 2.033898305084746e-06,
|
| 18988 |
+
"loss": 1.4069,
|
| 18989 |
+
"step": 27000
|
| 18990 |
+
},
|
| 18991 |
+
{
|
| 18992 |
+
"epoch": 0.68,
|
| 18993 |
+
"grad_norm": 3.8006324768066406,
|
| 18994 |
+
"learning_rate": 2.027118644067797e-06,
|
| 18995 |
+
"loss": 1.3013,
|
| 18996 |
+
"step": 27010
|
| 18997 |
+
},
|
| 18998 |
+
{
|
| 18999 |
+
"epoch": 0.68,
|
| 19000 |
+
"grad_norm": 3.052011728286743,
|
| 19001 |
+
"learning_rate": 2.020338983050848e-06,
|
| 19002 |
+
"loss": 1.3073,
|
| 19003 |
+
"step": 27020
|
| 19004 |
+
},
|
| 19005 |
+
{
|
| 19006 |
+
"epoch": 0.68,
|
| 19007 |
+
"grad_norm": 6.314701080322266,
|
| 19008 |
+
"learning_rate": 2.0135593220338984e-06,
|
| 19009 |
+
"loss": 1.3644,
|
| 19010 |
+
"step": 27030
|
| 19011 |
+
},
|
| 19012 |
+
{
|
| 19013 |
+
"epoch": 0.68,
|
| 19014 |
+
"grad_norm": 2.868659257888794,
|
| 19015 |
+
"learning_rate": 2.0067796610169494e-06,
|
| 19016 |
+
"loss": 1.4159,
|
| 19017 |
+
"step": 27040
|
| 19018 |
+
},
|
| 19019 |
+
{
|
| 19020 |
+
"epoch": 0.68,
|
| 19021 |
+
"grad_norm": 3.1452548503875732,
|
| 19022 |
+
"learning_rate": 2.0000000000000003e-06,
|
| 19023 |
+
"loss": 1.2184,
|
| 19024 |
+
"step": 27050
|
| 19025 |
+
},
|
| 19026 |
+
{
|
| 19027 |
+
"epoch": 0.68,
|
| 19028 |
+
"grad_norm": 7.137606620788574,
|
| 19029 |
+
"learning_rate": 1.993220338983051e-06,
|
| 19030 |
+
"loss": 1.1827,
|
| 19031 |
+
"step": 27060
|
| 19032 |
+
},
|
| 19033 |
+
{
|
| 19034 |
+
"epoch": 0.68,
|
| 19035 |
+
"grad_norm": 4.114950180053711,
|
| 19036 |
+
"learning_rate": 1.986440677966102e-06,
|
| 19037 |
+
"loss": 1.3316,
|
| 19038 |
+
"step": 27070
|
| 19039 |
+
},
|
| 19040 |
+
{
|
| 19041 |
+
"epoch": 0.68,
|
| 19042 |
+
"grad_norm": 4.815858364105225,
|
| 19043 |
+
"learning_rate": 1.9796610169491524e-06,
|
| 19044 |
+
"loss": 1.286,
|
| 19045 |
+
"step": 27080
|
| 19046 |
+
},
|
| 19047 |
+
{
|
| 19048 |
+
"epoch": 0.68,
|
| 19049 |
+
"grad_norm": 2.6551191806793213,
|
| 19050 |
+
"learning_rate": 1.9728813559322034e-06,
|
| 19051 |
+
"loss": 1.2562,
|
| 19052 |
+
"step": 27090
|
| 19053 |
+
},
|
| 19054 |
+
{
|
| 19055 |
+
"epoch": 0.68,
|
| 19056 |
+
"grad_norm": 10.009329795837402,
|
| 19057 |
+
"learning_rate": 1.9661016949152544e-06,
|
| 19058 |
+
"loss": 1.3984,
|
| 19059 |
+
"step": 27100
|
| 19060 |
+
},
|
| 19061 |
+
{
|
| 19062 |
+
"epoch": 0.68,
|
| 19063 |
+
"grad_norm": 2.3229589462280273,
|
| 19064 |
+
"learning_rate": 1.9593220338983053e-06,
|
| 19065 |
+
"loss": 1.3478,
|
| 19066 |
+
"step": 27110
|
| 19067 |
+
},
|
| 19068 |
+
{
|
| 19069 |
+
"epoch": 0.68,
|
| 19070 |
+
"grad_norm": 5.636902332305908,
|
| 19071 |
+
"learning_rate": 1.9525423728813563e-06,
|
| 19072 |
+
"loss": 1.3959,
|
| 19073 |
+
"step": 27120
|
| 19074 |
+
},
|
| 19075 |
+
{
|
| 19076 |
+
"epoch": 0.68,
|
| 19077 |
+
"grad_norm": 15.958221435546875,
|
| 19078 |
+
"learning_rate": 1.945762711864407e-06,
|
| 19079 |
+
"loss": 1.4275,
|
| 19080 |
+
"step": 27130
|
| 19081 |
+
},
|
| 19082 |
+
{
|
| 19083 |
+
"epoch": 0.68,
|
| 19084 |
+
"grad_norm": 5.097884654998779,
|
| 19085 |
+
"learning_rate": 1.938983050847458e-06,
|
| 19086 |
+
"loss": 1.3826,
|
| 19087 |
+
"step": 27140
|
| 19088 |
+
},
|
| 19089 |
+
{
|
| 19090 |
+
"epoch": 0.68,
|
| 19091 |
+
"grad_norm": 6.786471366882324,
|
| 19092 |
+
"learning_rate": 1.932203389830509e-06,
|
| 19093 |
+
"loss": 1.3073,
|
| 19094 |
+
"step": 27150
|
| 19095 |
+
},
|
| 19096 |
+
{
|
| 19097 |
+
"epoch": 0.68,
|
| 19098 |
+
"grad_norm": 7.529135704040527,
|
| 19099 |
+
"learning_rate": 1.9254237288135594e-06,
|
| 19100 |
+
"loss": 1.2605,
|
| 19101 |
+
"step": 27160
|
| 19102 |
+
},
|
| 19103 |
+
{
|
| 19104 |
+
"epoch": 0.68,
|
| 19105 |
+
"grad_norm": 10.065816879272461,
|
| 19106 |
+
"learning_rate": 1.9186440677966104e-06,
|
| 19107 |
+
"loss": 1.2377,
|
| 19108 |
+
"step": 27170
|
| 19109 |
+
},
|
| 19110 |
+
{
|
| 19111 |
+
"epoch": 0.68,
|
| 19112 |
+
"grad_norm": 5.812075614929199,
|
| 19113 |
+
"learning_rate": 1.9118644067796613e-06,
|
| 19114 |
+
"loss": 1.3309,
|
| 19115 |
+
"step": 27180
|
| 19116 |
+
},
|
| 19117 |
+
{
|
| 19118 |
+
"epoch": 0.68,
|
| 19119 |
+
"grad_norm": 4.214847564697266,
|
| 19120 |
+
"learning_rate": 1.9050847457627119e-06,
|
| 19121 |
+
"loss": 1.4063,
|
| 19122 |
+
"step": 27190
|
| 19123 |
+
},
|
| 19124 |
+
{
|
| 19125 |
+
"epoch": 0.68,
|
| 19126 |
+
"grad_norm": 4.021416187286377,
|
| 19127 |
+
"learning_rate": 1.8983050847457629e-06,
|
| 19128 |
+
"loss": 1.2773,
|
| 19129 |
+
"step": 27200
|
| 19130 |
+
},
|
| 19131 |
+
{
|
| 19132 |
+
"epoch": 0.68,
|
| 19133 |
+
"grad_norm": 7.9816083908081055,
|
| 19134 |
+
"learning_rate": 1.8915254237288136e-06,
|
| 19135 |
+
"loss": 1.2151,
|
| 19136 |
+
"step": 27210
|
| 19137 |
+
},
|
| 19138 |
+
{
|
| 19139 |
+
"epoch": 0.68,
|
| 19140 |
+
"grad_norm": 9.82458209991455,
|
| 19141 |
+
"learning_rate": 1.8847457627118646e-06,
|
| 19142 |
+
"loss": 1.393,
|
| 19143 |
+
"step": 27220
|
| 19144 |
+
},
|
| 19145 |
+
{
|
| 19146 |
+
"epoch": 0.68,
|
| 19147 |
+
"grad_norm": 16.87822914123535,
|
| 19148 |
+
"learning_rate": 1.8779661016949156e-06,
|
| 19149 |
+
"loss": 1.269,
|
| 19150 |
+
"step": 27230
|
| 19151 |
+
},
|
| 19152 |
+
{
|
| 19153 |
+
"epoch": 0.68,
|
| 19154 |
+
"grad_norm": 6.815838813781738,
|
| 19155 |
+
"learning_rate": 1.8711864406779661e-06,
|
| 19156 |
+
"loss": 1.4237,
|
| 19157 |
+
"step": 27240
|
| 19158 |
+
},
|
| 19159 |
+
{
|
| 19160 |
+
"epoch": 0.68,
|
| 19161 |
+
"grad_norm": 3.9835774898529053,
|
| 19162 |
+
"learning_rate": 1.8644067796610171e-06,
|
| 19163 |
+
"loss": 1.2872,
|
| 19164 |
+
"step": 27250
|
| 19165 |
+
},
|
| 19166 |
+
{
|
| 19167 |
+
"epoch": 0.68,
|
| 19168 |
+
"grad_norm": 4.0309953689575195,
|
| 19169 |
+
"learning_rate": 1.857627118644068e-06,
|
| 19170 |
+
"loss": 1.2629,
|
| 19171 |
+
"step": 27260
|
| 19172 |
+
},
|
| 19173 |
+
{
|
| 19174 |
+
"epoch": 0.68,
|
| 19175 |
+
"grad_norm": 12.035406112670898,
|
| 19176 |
+
"learning_rate": 1.8508474576271189e-06,
|
| 19177 |
+
"loss": 1.3755,
|
| 19178 |
+
"step": 27270
|
| 19179 |
+
},
|
| 19180 |
+
{
|
| 19181 |
+
"epoch": 0.68,
|
| 19182 |
+
"grad_norm": 1.8347936868667603,
|
| 19183 |
+
"learning_rate": 1.8440677966101696e-06,
|
| 19184 |
+
"loss": 1.3831,
|
| 19185 |
+
"step": 27280
|
| 19186 |
+
},
|
| 19187 |
+
{
|
| 19188 |
+
"epoch": 0.68,
|
| 19189 |
+
"grad_norm": 8.658760070800781,
|
| 19190 |
+
"learning_rate": 1.8372881355932204e-06,
|
| 19191 |
+
"loss": 1.3428,
|
| 19192 |
+
"step": 27290
|
| 19193 |
+
},
|
| 19194 |
+
{
|
| 19195 |
+
"epoch": 0.68,
|
| 19196 |
+
"grad_norm": 3.145319938659668,
|
| 19197 |
+
"learning_rate": 1.8305084745762714e-06,
|
| 19198 |
+
"loss": 1.4272,
|
| 19199 |
+
"step": 27300
|
| 19200 |
+
},
|
| 19201 |
+
{
|
| 19202 |
+
"epoch": 0.68,
|
| 19203 |
+
"grad_norm": 9.265095710754395,
|
| 19204 |
+
"learning_rate": 1.8237288135593223e-06,
|
| 19205 |
+
"loss": 1.4133,
|
| 19206 |
+
"step": 27310
|
| 19207 |
+
},
|
| 19208 |
+
{
|
| 19209 |
+
"epoch": 0.68,
|
| 19210 |
+
"grad_norm": 7.101969242095947,
|
| 19211 |
+
"learning_rate": 1.816949152542373e-06,
|
| 19212 |
+
"loss": 1.1041,
|
| 19213 |
+
"step": 27320
|
| 19214 |
+
},
|
| 19215 |
+
{
|
| 19216 |
+
"epoch": 0.68,
|
| 19217 |
+
"grad_norm": 16.614511489868164,
|
| 19218 |
+
"learning_rate": 1.8101694915254239e-06,
|
| 19219 |
+
"loss": 1.3473,
|
| 19220 |
+
"step": 27330
|
| 19221 |
+
},
|
| 19222 |
+
{
|
| 19223 |
+
"epoch": 0.68,
|
| 19224 |
+
"grad_norm": 2.301051378250122,
|
| 19225 |
+
"learning_rate": 1.8033898305084746e-06,
|
| 19226 |
+
"loss": 1.2432,
|
| 19227 |
+
"step": 27340
|
| 19228 |
+
},
|
| 19229 |
+
{
|
| 19230 |
+
"epoch": 0.68,
|
| 19231 |
+
"grad_norm": 5.643409729003906,
|
| 19232 |
+
"learning_rate": 1.7966101694915256e-06,
|
| 19233 |
+
"loss": 1.2781,
|
| 19234 |
+
"step": 27350
|
| 19235 |
+
},
|
| 19236 |
+
{
|
| 19237 |
+
"epoch": 0.68,
|
| 19238 |
+
"grad_norm": 7.286752223968506,
|
| 19239 |
+
"learning_rate": 1.7898305084745766e-06,
|
| 19240 |
+
"loss": 1.328,
|
| 19241 |
+
"step": 27360
|
| 19242 |
+
},
|
| 19243 |
+
{
|
| 19244 |
+
"epoch": 0.68,
|
| 19245 |
+
"grad_norm": 3.3953471183776855,
|
| 19246 |
+
"learning_rate": 1.7830508474576271e-06,
|
| 19247 |
+
"loss": 1.3025,
|
| 19248 |
+
"step": 27370
|
| 19249 |
+
},
|
| 19250 |
+
{
|
| 19251 |
+
"epoch": 0.68,
|
| 19252 |
+
"grad_norm": 8.240042686462402,
|
| 19253 |
+
"learning_rate": 1.7762711864406781e-06,
|
| 19254 |
+
"loss": 1.3847,
|
| 19255 |
+
"step": 27380
|
| 19256 |
+
},
|
| 19257 |
+
{
|
| 19258 |
+
"epoch": 0.68,
|
| 19259 |
+
"grad_norm": 9.705995559692383,
|
| 19260 |
+
"learning_rate": 1.769491525423729e-06,
|
| 19261 |
+
"loss": 1.427,
|
| 19262 |
+
"step": 27390
|
| 19263 |
+
},
|
| 19264 |
+
{
|
| 19265 |
+
"epoch": 0.69,
|
| 19266 |
+
"grad_norm": 3.306814193725586,
|
| 19267 |
+
"learning_rate": 1.7627118644067799e-06,
|
| 19268 |
+
"loss": 1.4487,
|
| 19269 |
+
"step": 27400
|
| 19270 |
+
},
|
| 19271 |
+
{
|
| 19272 |
+
"epoch": 0.69,
|
| 19273 |
+
"grad_norm": 15.25204086303711,
|
| 19274 |
+
"learning_rate": 1.7559322033898306e-06,
|
| 19275 |
+
"loss": 1.1448,
|
| 19276 |
+
"step": 27410
|
| 19277 |
+
},
|
| 19278 |
+
{
|
| 19279 |
+
"epoch": 0.69,
|
| 19280 |
+
"grad_norm": 9.065521240234375,
|
| 19281 |
+
"learning_rate": 1.7491525423728814e-06,
|
| 19282 |
+
"loss": 1.4139,
|
| 19283 |
+
"step": 27420
|
| 19284 |
+
},
|
| 19285 |
+
{
|
| 19286 |
+
"epoch": 0.69,
|
| 19287 |
+
"grad_norm": 7.884547233581543,
|
| 19288 |
+
"learning_rate": 1.7423728813559324e-06,
|
| 19289 |
+
"loss": 1.3313,
|
| 19290 |
+
"step": 27430
|
| 19291 |
+
},
|
| 19292 |
+
{
|
| 19293 |
+
"epoch": 0.69,
|
| 19294 |
+
"grad_norm": 8.109780311584473,
|
| 19295 |
+
"learning_rate": 1.7355932203389834e-06,
|
| 19296 |
+
"loss": 1.2523,
|
| 19297 |
+
"step": 27440
|
| 19298 |
+
},
|
| 19299 |
+
{
|
| 19300 |
+
"epoch": 0.69,
|
| 19301 |
+
"grad_norm": 23.829362869262695,
|
| 19302 |
+
"learning_rate": 1.728813559322034e-06,
|
| 19303 |
+
"loss": 1.2634,
|
| 19304 |
+
"step": 27450
|
| 19305 |
+
},
|
| 19306 |
+
{
|
| 19307 |
+
"epoch": 0.69,
|
| 19308 |
+
"grad_norm": 8.787532806396484,
|
| 19309 |
+
"learning_rate": 1.7220338983050849e-06,
|
| 19310 |
+
"loss": 1.3913,
|
| 19311 |
+
"step": 27460
|
| 19312 |
+
},
|
| 19313 |
+
{
|
| 19314 |
+
"epoch": 0.69,
|
| 19315 |
+
"grad_norm": 17.018415451049805,
|
| 19316 |
+
"learning_rate": 1.7152542372881356e-06,
|
| 19317 |
+
"loss": 1.2223,
|
| 19318 |
+
"step": 27470
|
| 19319 |
+
},
|
| 19320 |
+
{
|
| 19321 |
+
"epoch": 0.69,
|
| 19322 |
+
"grad_norm": 9.56651782989502,
|
| 19323 |
+
"learning_rate": 1.7084745762711866e-06,
|
| 19324 |
+
"loss": 1.3977,
|
| 19325 |
+
"step": 27480
|
| 19326 |
+
},
|
| 19327 |
+
{
|
| 19328 |
+
"epoch": 0.69,
|
| 19329 |
+
"grad_norm": 4.520813941955566,
|
| 19330 |
+
"learning_rate": 1.7016949152542376e-06,
|
| 19331 |
+
"loss": 1.3356,
|
| 19332 |
+
"step": 27490
|
| 19333 |
+
},
|
| 19334 |
+
{
|
| 19335 |
+
"epoch": 0.69,
|
| 19336 |
+
"grad_norm": 5.794963359832764,
|
| 19337 |
+
"learning_rate": 1.6949152542372882e-06,
|
| 19338 |
+
"loss": 1.2999,
|
| 19339 |
+
"step": 27500
|
| 19340 |
+
},
|
| 19341 |
+
{
|
| 19342 |
+
"epoch": 0.69,
|
| 19343 |
+
"eval_loss": 1.2965120077133179,
|
| 19344 |
+
"eval_runtime": 122.3839,
|
| 19345 |
+
"eval_samples_per_second": 8.171,
|
| 19346 |
+
"eval_steps_per_second": 8.171,
|
| 19347 |
+
"step": 27500
|
| 19348 |
}
|
| 19349 |
],
|
| 19350 |
"logging_steps": 10,
|
|
|
|
| 19352 |
"num_input_tokens_seen": 0,
|
| 19353 |
"num_train_epochs": 1,
|
| 19354 |
"save_steps": 2500,
|
| 19355 |
+
"total_flos": 4.4280846483456e+17,
|
| 19356 |
"train_batch_size": 1,
|
| 19357 |
"trial_name": null,
|
| 19358 |
"trial_params": null
|