astardusta committed on
Commit
e26bf72
·
verified ·
1 Parent(s): b5dc464

End of training

Browse files
README.md CHANGED
@@ -37,9 +37,9 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
37
 
38
  ### Framework versions
39
 
40
- - TRL: 0.18.1
41
  - Transformers: 4.52.4
42
- - Pytorch: 2.7.0
43
  - Datasets: 3.6.0
44
  - Tokenizers: 0.21.1
45
 
 
37
 
38
  ### Framework versions
39
 
40
+ - TRL: 0.18.2
41
  - Transformers: 4.52.4
42
+ - Pytorch: 2.7.1
43
  - Datasets: 3.6.0
44
  - Tokenizers: 0.21.1
45
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.13003649711608886,
4
- "train_runtime": 4843.1597,
5
- "train_samples": 33,
6
- "train_samples_per_second": 0.004,
7
- "train_steps_per_second": 0.001
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 8.335709571838379e-06,
4
+ "train_runtime": 2249.7087,
5
+ "train_samples": 2,
6
+ "train_samples_per_second": 0.009,
7
+ "train_steps_per_second": 0.002
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29162337a7219006047e083f7c12d6a9388c43c8dd4348cab92422efb60f915b
3
  size 1976163472
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd2f90d31f60841357e40a7d80a8590c8439aab55aea403847ffe115bba02aa8
3
  size 1976163472
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.13003649711608886,
4
- "train_runtime": 4843.1597,
5
- "train_samples": 33,
6
- "train_samples_per_second": 0.004,
7
- "train_steps_per_second": 0.001
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 8.335709571838379e-06,
4
+ "train_runtime": 2249.7087,
5
+ "train_samples": 2,
6
+ "train_samples_per_second": 0.009,
7
+ "train_steps_per_second": 0.002
8
  }
trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.30303030303030304,
6
  "eval_steps": 500,
7
  "global_step": 5,
8
  "is_hyper_param_search": false,
@@ -16,31 +16,31 @@
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
- "completions/max_length": 573.5,
20
- "completions/max_terminated_length": 573.5,
21
- "completions/mean_length": 274.25,
22
- "completions/mean_terminated_length": 274.25,
23
- "completions/min_length": 71.0,
24
- "completions/min_terminated_length": 71.0,
25
- "epoch": 0.12121212121212122,
26
  "frac_reward_zero_std": 0.25,
27
- "grad_norm": 7.713528156280518,
28
  "kl": 0.0,
29
  "learning_rate": 5e-07,
30
- "loss": 0.131,
31
- "num_tokens": 4242.0,
32
- "reward": 0.08580746594816446,
33
- "reward_std": 0.0030113481334410608,
34
- "rewards/concensus_correctness_reward_func/mean": 0.0,
35
- "rewards/concensus_correctness_reward_func/std": 0.0,
36
  "rewards/consensus_reward_func/mean": 0.0,
37
  "rewards/consensus_reward_func/std": 0.0,
38
  "rewards/cumulative_reward_2/mean": 0.0,
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
- "rewards/question_recreation_reward_func/mean": 0.08580746594816446,
43
- "rewards/question_recreation_reward_func/std": 0.01734682370442897,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -55,32 +55,32 @@
55
  "clip_ratio/low_mean": 0.0,
56
  "clip_ratio/low_min": 0.0,
57
  "clip_ratio/region_mean": 0.0,
58
- "completions/clipped_ratio": 0.125,
59
- "completions/max_length": 795.5,
60
- "completions/max_terminated_length": 684.5,
61
- "completions/mean_length": 331.875,
62
- "completions/mean_terminated_length": 241.0,
63
- "completions/min_length": 22.5,
64
- "completions/min_terminated_length": 22.5,
65
- "epoch": 0.24242424242424243,
66
- "frac_reward_zero_std": 0.25,
67
- "grad_norm": 6.215135097503662,
68
- "kl": 0.0003396936699573416,
69
  "learning_rate": 2.5e-07,
70
- "loss": 0.0075,
71
- "num_tokens": 8945.0,
72
- "reward": 0.016037299297749996,
73
- "reward_std": 0.007742004003375769,
74
- "rewards/concensus_correctness_reward_func/mean": 0.0,
75
- "rewards/concensus_correctness_reward_func/std": 0.0,
76
  "rewards/consensus_reward_func/mean": 0.0,
77
  "rewards/consensus_reward_func/std": 0.0,
78
  "rewards/cumulative_reward_2/mean": 0.0,
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
- "rewards/question_recreation_reward_func/mean": 0.016037299297749996,
83
- "rewards/question_recreation_reward_func/std": 0.016464148182421923,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -95,29 +95,29 @@
95
  "clip_ratio/low_mean": 0.0,
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
- "completions/clipped_ratio": 0.0,
99
- "completions/max_length": 348.0,
100
- "completions/max_terminated_length": 348.0,
101
- "completions/mean_length": 220.0,
102
- "completions/mean_terminated_length": 220.0,
103
- "completions/min_length": 89.0,
104
- "completions/min_terminated_length": 89.0,
105
- "epoch": 0.30303030303030304,
106
  "frac_reward_zero_std": 0.0,
107
- "kl": 0.0014414309989660978,
108
- "num_tokens": 10849.0,
109
- "reward": 0.009446130134165287,
110
- "reward_std": 0.005829450208693743,
111
- "rewards/concensus_correctness_reward_func/mean": 0.0,
112
- "rewards/concensus_correctness_reward_func/std": 0.0,
113
- "rewards/consensus_reward_func/mean": 0.0,
114
- "rewards/consensus_reward_func/std": 0.0,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
- "rewards/question_recreation_reward_func/mean": 0.009446130134165287,
120
- "rewards/question_recreation_reward_func/std": 0.00640238169580698,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
@@ -126,16 +126,16 @@
126
  "rewards/xmlcount_reward_func/std": 0.0,
127
  "step": 5,
128
  "total_flos": 0.0,
129
- "train_loss": 0.13003649711608886,
130
- "train_runtime": 4843.1597,
131
- "train_samples_per_second": 0.004,
132
- "train_steps_per_second": 0.001
133
  }
134
  ],
135
  "logging_steps": 2,
136
  "max_steps": 5,
137
- "num_input_tokens_seen": 10849,
138
- "num_train_epochs": 1,
139
  "save_steps": 25,
140
  "stateful_callbacks": {
141
  "TrainerControl": {
@@ -150,7 +150,7 @@
150
  }
151
  },
152
  "total_flos": 0.0,
153
- "train_batch_size": 2,
154
  "trial_name": null,
155
  "trial_params": null
156
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 5.0,
6
  "eval_steps": 500,
7
  "global_step": 5,
8
  "is_hyper_param_search": false,
 
16
  "clip_ratio/low_min": 0.0,
17
  "clip_ratio/region_mean": 0.0,
18
  "completions/clipped_ratio": 0.0,
19
+ "completions/max_length": 214.5,
20
+ "completions/max_terminated_length": 214.5,
21
+ "completions/mean_length": 100.0,
22
+ "completions/mean_terminated_length": 100.0,
23
+ "completions/min_length": 4.0,
24
+ "completions/min_terminated_length": 4.0,
25
+ "epoch": 2.0,
26
  "frac_reward_zero_std": 0.25,
27
+ "grad_norm": 62.43251037597656,
28
  "kl": 0.0,
29
  "learning_rate": 5e-07,
30
+ "loss": -0.0,
31
+ "num_tokens": 2848.0,
32
+ "reward": 0.39153189957141876,
33
+ "reward_std": 0.17964571295306087,
34
+ "rewards/concensus_correctness_reward_func/mean": 0.375,
35
+ "rewards/concensus_correctness_reward_func/std": 0.5386751294136047,
36
  "rewards/consensus_reward_func/mean": 0.0,
37
  "rewards/consensus_reward_func/std": 0.0,
38
  "rewards/cumulative_reward_2/mean": 0.0,
39
  "rewards/cumulative_reward_2/std": 0.0,
40
  "rewards/final_correctness_reward_func/mean": 0.0,
41
  "rewards/final_correctness_reward_func/std": 0.0,
42
+ "rewards/question_recreation_reward_func/mean": 0.016531903762370348,
43
+ "rewards/question_recreation_reward_func/std": 0.00948757166042924,
44
  "rewards/soft_format_reward_func/mean": 0.0,
45
  "rewards/soft_format_reward_func/std": 0.0,
46
  "rewards/strict_format_reward_func/mean": 0.0,
 
55
  "clip_ratio/low_mean": 0.0,
56
  "clip_ratio/low_min": 0.0,
57
  "clip_ratio/region_mean": 0.0,
58
+ "completions/clipped_ratio": 0.0,
59
+ "completions/max_length": 188.0,
60
+ "completions/max_terminated_length": 188.0,
61
+ "completions/mean_length": 78.375,
62
+ "completions/mean_terminated_length": 78.375,
63
+ "completions/min_length": 10.5,
64
+ "completions/min_terminated_length": 10.5,
65
+ "epoch": 4.0,
66
+ "frac_reward_zero_std": 0.0,
67
+ "grad_norm": 25.1249942779541,
68
+ "kl": 0.006960342208913062,
69
  "learning_rate": 2.5e-07,
70
+ "loss": 0.0,
71
+ "num_tokens": 5523.0,
72
+ "reward": 0.14147359877824783,
73
+ "reward_std": 0.18529291450977325,
74
+ "rewards/concensus_correctness_reward_func/mean": 0.125,
75
+ "rewards/concensus_correctness_reward_func/std": 0.25,
76
  "rewards/consensus_reward_func/mean": 0.0,
77
  "rewards/consensus_reward_func/std": 0.0,
78
  "rewards/cumulative_reward_2/mean": 0.0,
79
  "rewards/cumulative_reward_2/std": 0.0,
80
  "rewards/final_correctness_reward_func/mean": 0.0,
81
  "rewards/final_correctness_reward_func/std": 0.0,
82
+ "rewards/question_recreation_reward_func/mean": 0.016473600640892982,
83
+ "rewards/question_recreation_reward_func/std": 0.012013186700642109,
84
  "rewards/soft_format_reward_func/mean": 0.0,
85
  "rewards/soft_format_reward_func/std": 0.0,
86
  "rewards/strict_format_reward_func/mean": 0.0,
 
95
  "clip_ratio/low_mean": 0.0,
96
  "clip_ratio/low_min": 0.0,
97
  "clip_ratio/region_mean": 0.0,
98
+ "completions/clipped_ratio": 0.25,
99
+ "completions/max_length": 1024.0,
100
+ "completions/max_terminated_length": 8.0,
101
+ "completions/mean_length": 260.0,
102
+ "completions/mean_terminated_length": 5.333333492279053,
103
+ "completions/min_length": 4.0,
104
+ "completions/min_terminated_length": 4.0,
105
+ "epoch": 5.0,
106
  "frac_reward_zero_std": 0.0,
107
+ "kl": 0.027669312112266198,
108
+ "num_tokens": 7587.0,
109
+ "reward": 0.7625923156738281,
110
+ "reward_std": 1.0524399280548096,
111
+ "rewards/concensus_correctness_reward_func/mean": 0.25,
112
+ "rewards/concensus_correctness_reward_func/std": 0.5,
113
+ "rewards/consensus_reward_func/mean": 0.5,
114
+ "rewards/consensus_reward_func/std": 1.0,
115
  "rewards/cumulative_reward_2/mean": 0.0,
116
  "rewards/cumulative_reward_2/std": 0.0,
117
  "rewards/final_correctness_reward_func/mean": 0.0,
118
  "rewards/final_correctness_reward_func/std": 0.0,
119
+ "rewards/question_recreation_reward_func/mean": 0.012592295184731483,
120
+ "rewards/question_recreation_reward_func/std": 0.009031646884977818,
121
  "rewards/soft_format_reward_func/mean": 0.0,
122
  "rewards/soft_format_reward_func/std": 0.0,
123
  "rewards/strict_format_reward_func/mean": 0.0,
 
126
  "rewards/xmlcount_reward_func/std": 0.0,
127
  "step": 5,
128
  "total_flos": 0.0,
129
+ "train_loss": 8.335709571838379e-06,
130
+ "train_runtime": 2249.7087,
131
+ "train_samples_per_second": 0.009,
132
+ "train_steps_per_second": 0.002
133
  }
134
  ],
135
  "logging_steps": 2,
136
  "max_steps": 5,
137
+ "num_input_tokens_seen": 7587,
138
+ "num_train_epochs": 5,
139
  "save_steps": 25,
140
  "stateful_callbacks": {
141
  "TrainerControl": {
 
150
  }
151
  },
152
  "total_flos": 0.0,
153
+ "train_batch_size": 1,
154
  "trial_name": null,
155
  "trial_params": null
156
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37d7dd9760e2f0b9fc0ecb1bb7fe855a2ba96420c98f7b3fdad2eb40e8a0b8be
3
  size 6865
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6696c1da0590b9246244a1e4ddc0545787b5190ca665df5ee07fe4357d032fec
3
  size 6865