Training in progress, step 600, checkpoint
Browse files
    	
        last-checkpoint/adapter_model.safetensors
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
             
            size 349243752
         | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:98488c56f318dc2da3929b18ad2ad5a152e66efba66afbe4fc87cda337b6db57
         | 
| 3 | 
             
            size 349243752
         | 
    	
        last-checkpoint/optimizer.pt
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
             
            size 177909253
         | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:057c770e031ada072d6f11d80e9d3ef37634519804b1e3934feba9bcc0ac546d
         | 
| 3 | 
             
            size 177909253
         | 
    	
        last-checkpoint/rng_state.pth
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
             
            size 14645
         | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:442b031d01f716fa595ec83da7a5b8b396b18c106796b82715fedbff217e57d5
         | 
| 3 | 
             
            size 14645
         | 
    	
        last-checkpoint/scheduler.pt
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
             
            size 1465
         | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:24481c486beb14ce7d59d0586b23c806ec848e00bc91bbf21c23488cf27d188d
         | 
| 3 | 
             
            size 1465
         | 
    	
        last-checkpoint/trainer_state.json
    CHANGED
    
    | @@ -2,9 +2,9 @@ | |
| 2 | 
             
              "best_global_step": null,
         | 
| 3 | 
             
              "best_metric": null,
         | 
| 4 | 
             
              "best_model_checkpoint": null,
         | 
| 5 | 
            -
              "epoch": 0. | 
| 6 | 
             
              "eval_steps": 500,
         | 
| 7 | 
            -
              "global_step":  | 
| 8 | 
             
              "is_hyper_param_search": false,
         | 
| 9 | 
             
              "is_local_process_zero": true,
         | 
| 10 | 
             
              "is_world_process_zero": true,
         | 
| @@ -883,6 +883,181 @@ | |
| 883 | 
             
                  "learning_rate": 8.012803577096473e-06,
         | 
| 884 | 
             
                  "loss": 1.3037,
         | 
| 885 | 
             
                  "step": 500
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 886 | 
             
                }
         | 
| 887 | 
             
              ],
         | 
| 888 | 
             
              "logging_steps": 4,
         | 
| @@ -902,7 +1077,7 @@ | |
| 902 | 
             
                  "attributes": {}
         | 
| 903 | 
             
                }
         | 
| 904 | 
             
              },
         | 
| 905 | 
            -
              "total_flos":  | 
| 906 | 
             
              "train_batch_size": 24,
         | 
| 907 | 
             
              "trial_name": null,
         | 
| 908 | 
             
              "trial_params": null
         | 
|  | |
| 2 | 
             
              "best_global_step": null,
         | 
| 3 | 
             
              "best_metric": null,
         | 
| 4 | 
             
              "best_model_checkpoint": null,
         | 
| 5 | 
            +
              "epoch": 0.5970149253731343,
         | 
| 6 | 
             
              "eval_steps": 500,
         | 
| 7 | 
            +
              "global_step": 600,
         | 
| 8 | 
             
              "is_hyper_param_search": false,
         | 
| 9 | 
             
              "is_local_process_zero": true,
         | 
| 10 | 
             
              "is_world_process_zero": true,
         | 
|  | |
| 883 | 
             
                  "learning_rate": 8.012803577096473e-06,
         | 
| 884 | 
             
                  "loss": 1.3037,
         | 
| 885 | 
             
                  "step": 500
         | 
| 886 | 
            +
                },
         | 
| 887 | 
            +
                {
         | 
| 888 | 
            +
                  "epoch": 0.5014925373134328,
         | 
| 889 | 
            +
                  "grad_norm": 0.17158161103725433,
         | 
| 890 | 
            +
                  "learning_rate": 7.92168888781252e-06,
         | 
| 891 | 
            +
                  "loss": 1.409,
         | 
| 892 | 
            +
                  "step": 504
         | 
| 893 | 
            +
                },
         | 
| 894 | 
            +
                {
         | 
| 895 | 
            +
                  "epoch": 0.5054726368159204,
         | 
| 896 | 
            +
                  "grad_norm": 0.1840897649526596,
         | 
| 897 | 
            +
                  "learning_rate": 7.830051785587235e-06,
         | 
| 898 | 
            +
                  "loss": 1.3857,
         | 
| 899 | 
            +
                  "step": 508
         | 
| 900 | 
            +
                },
         | 
| 901 | 
            +
                {
         | 
| 902 | 
            +
                  "epoch": 0.5094527363184079,
         | 
| 903 | 
            +
                  "grad_norm": 0.16382519900798798,
         | 
| 904 | 
            +
                  "learning_rate": 7.737916550320155e-06,
         | 
| 905 | 
            +
                  "loss": 1.3431,
         | 
| 906 | 
            +
                  "step": 512
         | 
| 907 | 
            +
                },
         | 
| 908 | 
            +
                {
         | 
| 909 | 
            +
                  "epoch": 0.5134328358208955,
         | 
| 910 | 
            +
                  "grad_norm": 0.12751099467277527,
         | 
| 911 | 
            +
                  "learning_rate": 7.64530759389469e-06,
         | 
| 912 | 
            +
                  "loss": 1.3626,
         | 
| 913 | 
            +
                  "step": 516
         | 
| 914 | 
            +
                },
         | 
| 915 | 
            +
                {
         | 
| 916 | 
            +
                  "epoch": 0.5174129353233831,
         | 
| 917 | 
            +
                  "grad_norm": 0.16248376667499542,
         | 
| 918 | 
            +
                  "learning_rate": 7.552249453710032e-06,
         | 
| 919 | 
            +
                  "loss": 1.3129,
         | 
| 920 | 
            +
                  "step": 520
         | 
| 921 | 
            +
                },
         | 
| 922 | 
            +
                {
         | 
| 923 | 
            +
                  "epoch": 0.5213930348258706,
         | 
| 924 | 
            +
                  "grad_norm": 0.1406685709953308,
         | 
| 925 | 
            +
                  "learning_rate": 7.458766786179792e-06,
         | 
| 926 | 
            +
                  "loss": 1.3628,
         | 
| 927 | 
            +
                  "step": 524
         | 
| 928 | 
            +
                },
         | 
| 929 | 
            +
                {
         | 
| 930 | 
            +
                  "epoch": 0.5253731343283582,
         | 
| 931 | 
            +
                  "grad_norm": 0.13998349010944366,
         | 
| 932 | 
            +
                  "learning_rate": 7.364884360199107e-06,
         | 
| 933 | 
            +
                  "loss": 1.3887,
         | 
| 934 | 
            +
                  "step": 528
         | 
| 935 | 
            +
                },
         | 
| 936 | 
            +
                {
         | 
| 937 | 
            +
                  "epoch": 0.5293532338308458,
         | 
| 938 | 
            +
                  "grad_norm": 0.15993693470954895,
         | 
| 939 | 
            +
                  "learning_rate": 7.270627050581951e-06,
         | 
| 940 | 
            +
                  "loss": 1.3764,
         | 
| 941 | 
            +
                  "step": 532
         | 
| 942 | 
            +
                },
         | 
| 943 | 
            +
                {
         | 
| 944 | 
            +
                  "epoch": 0.5333333333333333,
         | 
| 945 | 
            +
                  "grad_norm": 0.21970954537391663,
         | 
| 946 | 
            +
                  "learning_rate": 7.176019831470373e-06,
         | 
| 947 | 
            +
                  "loss": 1.4067,
         | 
| 948 | 
            +
                  "step": 536
         | 
| 949 | 
            +
                },
         | 
| 950 | 
            +
                {
         | 
| 951 | 
            +
                  "epoch": 0.5373134328358209,
         | 
| 952 | 
            +
                  "grad_norm": 0.1592174619436264,
         | 
| 953 | 
            +
                  "learning_rate": 7.081087769717416e-06,
         | 
| 954 | 
            +
                  "loss": 1.4348,
         | 
| 955 | 
            +
                  "step": 540
         | 
| 956 | 
            +
                },
         | 
| 957 | 
            +
                {
         | 
| 958 | 
            +
                  "epoch": 0.5412935323383085,
         | 
| 959 | 
            +
                  "grad_norm": 0.1640401929616928,
         | 
| 960 | 
            +
                  "learning_rate": 6.985856018245494e-06,
         | 
| 961 | 
            +
                  "loss": 1.39,
         | 
| 962 | 
            +
                  "step": 544
         | 
| 963 | 
            +
                },
         | 
| 964 | 
            +
                {
         | 
| 965 | 
            +
                  "epoch": 0.545273631840796,
         | 
| 966 | 
            +
                  "grad_norm": 0.14530886709690094,
         | 
| 967 | 
            +
                  "learning_rate": 6.890349809381926e-06,
         | 
| 968 | 
            +
                  "loss": 1.4217,
         | 
| 969 | 
            +
                  "step": 548
         | 
| 970 | 
            +
                },
         | 
| 971 | 
            +
                {
         | 
| 972 | 
            +
                  "epoch": 0.5492537313432836,
         | 
| 973 | 
            +
                  "grad_norm": 0.14905086159706116,
         | 
| 974 | 
            +
                  "learning_rate": 6.7945944481734625e-06,
         | 
| 975 | 
            +
                  "loss": 1.3693,
         | 
| 976 | 
            +
                  "step": 552
         | 
| 977 | 
            +
                },
         | 
| 978 | 
            +
                {
         | 
| 979 | 
            +
                  "epoch": 0.5532338308457712,
         | 
| 980 | 
            +
                  "grad_norm": 0.1508338302373886,
         | 
| 981 | 
            +
                  "learning_rate": 6.698615305681538e-06,
         | 
| 982 | 
            +
                  "loss": 1.3794,
         | 
| 983 | 
            +
                  "step": 556
         | 
| 984 | 
            +
                },
         | 
| 985 | 
            +
                {
         | 
| 986 | 
            +
                  "epoch": 0.5572139303482587,
         | 
| 987 | 
            +
                  "grad_norm": 0.15846911072731018,
         | 
| 988 | 
            +
                  "learning_rate": 6.602437812260021e-06,
         | 
| 989 | 
            +
                  "loss": 1.439,
         | 
| 990 | 
            +
                  "step": 560
         | 
| 991 | 
            +
                },
         | 
| 992 | 
            +
                {
         | 
| 993 | 
            +
                  "epoch": 0.5611940298507463,
         | 
| 994 | 
            +
                  "grad_norm": 0.15680456161499023,
         | 
| 995 | 
            +
                  "learning_rate": 6.5060874508172626e-06,
         | 
| 996 | 
            +
                  "loss": 1.3706,
         | 
| 997 | 
            +
                  "step": 564
         | 
| 998 | 
            +
                },
         | 
| 999 | 
            +
                {
         | 
| 1000 | 
            +
                  "epoch": 0.5651741293532339,
         | 
| 1001 | 
            +
                  "grad_norm": 0.14353099465370178,
         | 
| 1002 | 
            +
                  "learning_rate": 6.4095897500642245e-06,
         | 
| 1003 | 
            +
                  "loss": 1.4015,
         | 
| 1004 | 
            +
                  "step": 568
         | 
| 1005 | 
            +
                },
         | 
| 1006 | 
            +
                {
         | 
| 1007 | 
            +
                  "epoch": 0.5691542288557214,
         | 
| 1008 | 
            +
                  "grad_norm": 0.16101489961147308,
         | 
| 1009 | 
            +
                  "learning_rate": 6.3129702777504585e-06,
         | 
| 1010 | 
            +
                  "loss": 1.3364,
         | 
| 1011 | 
            +
                  "step": 572
         | 
| 1012 | 
            +
                },
         | 
| 1013 | 
            +
                {
         | 
| 1014 | 
            +
                  "epoch": 0.573134328358209,
         | 
| 1015 | 
            +
                  "grad_norm": 0.13535454869270325,
         | 
| 1016 | 
            +
                  "learning_rate": 6.216254633889758e-06,
         | 
| 1017 | 
            +
                  "loss": 1.3294,
         | 
| 1018 | 
            +
                  "step": 576
         | 
| 1019 | 
            +
                },
         | 
| 1020 | 
            +
                {
         | 
| 1021 | 
            +
                  "epoch": 0.5771144278606966,
         | 
| 1022 | 
            +
                  "grad_norm": 0.17043928802013397,
         | 
| 1023 | 
            +
                  "learning_rate": 6.119468443977249e-06,
         | 
| 1024 | 
            +
                  "loss": 1.4216,
         | 
| 1025 | 
            +
                  "step": 580
         | 
| 1026 | 
            +
                },
         | 
| 1027 | 
            +
                {
         | 
| 1028 | 
            +
                  "epoch": 0.5810945273631841,
         | 
| 1029 | 
            +
                  "grad_norm": 0.15072950720787048,
         | 
| 1030 | 
            +
                  "learning_rate": 6.02263735219973e-06,
         | 
| 1031 | 
            +
                  "loss": 1.4152,
         | 
| 1032 | 
            +
                  "step": 584
         | 
| 1033 | 
            +
                },
         | 
| 1034 | 
            +
                {
         | 
| 1035 | 
            +
                  "epoch": 0.5850746268656717,
         | 
| 1036 | 
            +
                  "grad_norm": 0.13807035982608795,
         | 
| 1037 | 
            +
                  "learning_rate": 5.925787014641067e-06,
         | 
| 1038 | 
            +
                  "loss": 1.369,
         | 
| 1039 | 
            +
                  "step": 588
         | 
| 1040 | 
            +
                },
         | 
| 1041 | 
            +
                {
         | 
| 1042 | 
            +
                  "epoch": 0.5890547263681593,
         | 
| 1043 | 
            +
                  "grad_norm": 0.1548323780298233,
         | 
| 1044 | 
            +
                  "learning_rate": 5.82894309248444e-06,
         | 
| 1045 | 
            +
                  "loss": 1.4166,
         | 
| 1046 | 
            +
                  "step": 592
         | 
| 1047 | 
            +
                },
         | 
| 1048 | 
            +
                {
         | 
| 1049 | 
            +
                  "epoch": 0.5930348258706468,
         | 
| 1050 | 
            +
                  "grad_norm": 0.16310498118400574,
         | 
| 1051 | 
            +
                  "learning_rate": 5.732131245213214e-06,
         | 
| 1052 | 
            +
                  "loss": 1.3644,
         | 
| 1053 | 
            +
                  "step": 596
         | 
| 1054 | 
            +
                },
         | 
| 1055 | 
            +
                {
         | 
| 1056 | 
            +
                  "epoch": 0.5970149253731343,
         | 
| 1057 | 
            +
                  "grad_norm": 0.14257760345935822,
         | 
| 1058 | 
            +
                  "learning_rate": 5.63537712381229e-06,
         | 
| 1059 | 
            +
                  "loss": 1.3559,
         | 
| 1060 | 
            +
                  "step": 600
         | 
| 1061 | 
             
                }
         | 
| 1062 | 
             
              ],
         | 
| 1063 | 
             
              "logging_steps": 4,
         | 
|  | |
| 1077 | 
             
                  "attributes": {}
         | 
| 1078 | 
             
                }
         | 
| 1079 | 
             
              },
         | 
| 1080 | 
            +
              "total_flos": 4.52163003620524e+17,
         | 
| 1081 | 
             
              "train_batch_size": 24,
         | 
| 1082 | 
             
              "trial_name": null,
         | 
| 1083 | 
             
              "trial_params": null
         | 
