powervel committed on
Commit
027d652
·
verified ·
1 Parent(s): beb8331

Upload processor

Browse files
Files changed (3) hide show
  1. added_tokens.json +1 -30
  2. special_tokens_map.json +107 -203
  3. tokenizer_config.json +107 -261
added_tokens.json CHANGED
@@ -1605,34 +1605,5 @@
1605
  "<|vi|>": 50278,
1606
  "<|yi|>": 50335,
1607
  "<|yo|>": 50325,
1608
- "<|zh|>": 50260,
1609
- "अ अ": 51878,
1610
- "अं": 51893,
1611
- "अन्न": 51886,
1612
- "अम्": 51865,
1613
- "अम्म": 51872,
1614
- "अह": 51889,
1615
- "अहं": 51880,
1616
- "आँ": 51867,
1617
- "आं": 51891,
1618
- "आंह": 51882,
1619
- "उ": 51884,
1620
- "उम्म": 51888,
1621
- "उह": 51869,
1622
- "ओ": 51871,
1623
- "ओह": 51876,
1624
- "ह": 51877,
1625
- "ह ह": 51881,
1626
- "ह ह ह": 51885,
1627
- "ह ह ह ह": 51875,
1628
- "हम्म": 51890,
1629
- "हह": 51868,
1630
- "हहह": 51883,
1631
- "हां": 51866,
1632
- "हाहा": 51873,
1633
- "हुं हुं": 51892,
1634
- "हुंह": 51874,
1635
- "हुह": 51887,
1636
- "हुह्ह": 51870,
1637
- "हूं हूं": 51879
1638
  }
 
1605
  "<|vi|>": 50278,
1606
  "<|yi|>": 50335,
1607
  "<|yo|>": 50325,
1608
+ "<|zh|>": 50260
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1609
  }
special_tokens_map.json CHANGED
@@ -1,208 +1,112 @@
1
  {
2
  "additional_special_tokens": [
3
- {
4
- "content": "अम्",
5
- "lstrip": false,
6
- "normalized": false,
7
- "rstrip": false,
8
- "single_word": false
9
- },
10
- {
11
- "content": "हां",
12
- "lstrip": false,
13
- "normalized": false,
14
- "rstrip": false,
15
- "single_word": false
16
- },
17
- {
18
- "content": "आँ",
19
- "lstrip": false,
20
- "normalized": false,
21
- "rstrip": false,
22
- "single_word": false
23
- },
24
- {
25
- "content": "हह",
26
- "lstrip": false,
27
- "normalized": false,
28
- "rstrip": false,
29
- "single_word": false
30
- },
31
- {
32
- "content": "उह",
33
- "lstrip": false,
34
- "normalized": false,
35
- "rstrip": false,
36
- "single_word": false
37
- },
38
- {
39
- "content": "हुह्ह",
40
- "lstrip": false,
41
- "normalized": false,
42
- "rstrip": false,
43
- "single_word": false
44
- },
45
- {
46
- "content": "ओ",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false
51
- },
52
- {
53
- "content": "अम्म",
54
- "lstrip": false,
55
- "normalized": false,
56
- "rstrip": false,
57
- "single_word": false
58
- },
59
- {
60
- "content": "हाहा",
61
- "lstrip": false,
62
- "normalized": false,
63
- "rstrip": false,
64
- "single_word": false
65
- },
66
- {
67
- "content": "हुंह",
68
- "lstrip": false,
69
- "normalized": false,
70
- "rstrip": false,
71
- "single_word": false
72
- },
73
- {
74
- "content": "ह ह ह ह",
75
- "lstrip": false,
76
- "normalized": false,
77
- "rstrip": false,
78
- "single_word": false
79
- },
80
- {
81
- "content": "ओह",
82
- "lstrip": false,
83
- "normalized": false,
84
- "rstrip": false,
85
- "single_word": false
86
- },
87
- {
88
- "content": "ह",
89
- "lstrip": false,
90
- "normalized": false,
91
- "rstrip": false,
92
- "single_word": false
93
- },
94
- {
95
- "content": "अ अ",
96
- "lstrip": false,
97
- "normalized": false,
98
- "rstrip": false,
99
- "single_word": false
100
- },
101
- {
102
- "content": "हूं हूं",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false
107
- },
108
- {
109
- "content": "अहं",
110
- "lstrip": false,
111
- "normalized": false,
112
- "rstrip": false,
113
- "single_word": false
114
- },
115
- {
116
- "content": "ह ह",
117
- "lstrip": false,
118
- "normalized": false,
119
- "rstrip": false,
120
- "single_word": false
121
- },
122
- {
123
- "content": "आंह",
124
- "lstrip": false,
125
- "normalized": false,
126
- "rstrip": false,
127
- "single_word": false
128
- },
129
- {
130
- "content": "हहह",
131
- "lstrip": false,
132
- "normalized": false,
133
- "rstrip": false,
134
- "single_word": false
135
- },
136
- {
137
- "content": "उ",
138
- "lstrip": false,
139
- "normalized": false,
140
- "rstrip": false,
141
- "single_word": false
142
- },
143
- {
144
- "content": "ह ह ह",
145
- "lstrip": false,
146
- "normalized": false,
147
- "rstrip": false,
148
- "single_word": false
149
- },
150
- {
151
- "content": "अन्न",
152
- "lstrip": false,
153
- "normalized": false,
154
- "rstrip": false,
155
- "single_word": false
156
- },
157
- {
158
- "content": "हुह",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false
163
- },
164
- {
165
- "content": "उम्म",
166
- "lstrip": false,
167
- "normalized": false,
168
- "rstrip": false,
169
- "single_word": false
170
- },
171
- {
172
- "content": "अह",
173
- "lstrip": false,
174
- "normalized": false,
175
- "rstrip": false,
176
- "single_word": false
177
- },
178
- {
179
- "content": "हम्म",
180
- "lstrip": false,
181
- "normalized": false,
182
- "rstrip": false,
183
- "single_word": false
184
- },
185
- {
186
- "content": "आं",
187
- "lstrip": false,
188
- "normalized": false,
189
- "rstrip": false,
190
- "single_word": false
191
- },
192
- {
193
- "content": "हुं हुं",
194
- "lstrip": false,
195
- "normalized": false,
196
- "rstrip": false,
197
- "single_word": false
198
- },
199
- {
200
- "content": "अं",
201
- "lstrip": false,
202
- "normalized": false,
203
- "rstrip": false,
204
- "single_word": false
205
- }
206
  ],
207
  "bos_token": {
208
  "content": "<|endoftext|>",
 
1
  {
2
  "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|startoftranscript|>",
5
+ "<|en|>",
6
+ "<|zh|>",
7
+ "<|de|>",
8
+ "<|es|>",
9
+ "<|ru|>",
10
+ "<|ko|>",
11
+ "<|fr|>",
12
+ "<|ja|>",
13
+ "<|pt|>",
14
+ "<|tr|>",
15
+ "<|pl|>",
16
+ "<|ca|>",
17
+ "<|nl|>",
18
+ "<|ar|>",
19
+ "<|sv|>",
20
+ "<|it|>",
21
+ "<|id|>",
22
+ "<|hi|>",
23
+ "<|fi|>",
24
+ "<|vi|>",
25
+ "<|he|>",
26
+ "<|uk|>",
27
+ "<|el|>",
28
+ "<|ms|>",
29
+ "<|cs|>",
30
+ "<|ro|>",
31
+ "<|da|>",
32
+ "<|hu|>",
33
+ "<|ta|>",
34
+ "<|no|>",
35
+ "<|th|>",
36
+ "<|ur|>",
37
+ "<|hr|>",
38
+ "<|bg|>",
39
+ "<|lt|>",
40
+ "<|la|>",
41
+ "<|mi|>",
42
+ "<|ml|>",
43
+ "<|cy|>",
44
+ "<|sk|>",
45
+ "<|te|>",
46
+ "<|fa|>",
47
+ "<|lv|>",
48
+ "<|bn|>",
49
+ "<|sr|>",
50
+ "<|az|>",
51
+ "<|sl|>",
52
+ "<|kn|>",
53
+ "<|et|>",
54
+ "<|mk|>",
55
+ "<|br|>",
56
+ "<|eu|>",
57
+ "<|is|>",
58
+ "<|hy|>",
59
+ "<|ne|>",
60
+ "<|mn|>",
61
+ "<|bs|>",
62
+ "<|kk|>",
63
+ "<|sq|>",
64
+ "<|sw|>",
65
+ "<|gl|>",
66
+ "<|mr|>",
67
+ "<|pa|>",
68
+ "<|si|>",
69
+ "<|km|>",
70
+ "<|sn|>",
71
+ "<|yo|>",
72
+ "<|so|>",
73
+ "<|af|>",
74
+ "<|oc|>",
75
+ "<|ka|>",
76
+ "<|be|>",
77
+ "<|tg|>",
78
+ "<|sd|>",
79
+ "<|gu|>",
80
+ "<|am|>",
81
+ "<|yi|>",
82
+ "<|lo|>",
83
+ "<|uz|>",
84
+ "<|fo|>",
85
+ "<|ht|>",
86
+ "<|ps|>",
87
+ "<|tk|>",
88
+ "<|nn|>",
89
+ "<|mt|>",
90
+ "<|sa|>",
91
+ "<|lb|>",
92
+ "<|my|>",
93
+ "<|bo|>",
94
+ "<|tl|>",
95
+ "<|mg|>",
96
+ "<|as|>",
97
+ "<|tt|>",
98
+ "<|haw|>",
99
+ "<|ln|>",
100
+ "<|ha|>",
101
+ "<|ba|>",
102
+ "<|jw|>",
103
+ "<|su|>",
104
+ "<|translate|>",
105
+ "<|transcribe|>",
106
+ "<|startoflm|>",
107
+ "<|startofprev|>",
108
+ "<|nocaptions|>",
109
+ "<|notimestamps|>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  ],
111
  "bos_token": {
112
  "content": "<|endoftext|>",
tokenizer_config.json CHANGED
@@ -12865,270 +12865,116 @@
12865
  "rstrip": false,
12866
  "single_word": false,
12867
  "special": false
12868
- },
12869
- "51865": {
12870
- "content": "अम्",
12871
- "lstrip": false,
12872
- "normalized": false,
12873
- "rstrip": false,
12874
- "single_word": false,
12875
- "special": true
12876
- },
12877
- "51866": {
12878
- "content": "हां",
12879
- "lstrip": false,
12880
- "normalized": false,
12881
- "rstrip": false,
12882
- "single_word": false,
12883
- "special": true
12884
- },
12885
- "51867": {
12886
- "content": "आँ",
12887
- "lstrip": false,
12888
- "normalized": false,
12889
- "rstrip": false,
12890
- "single_word": false,
12891
- "special": true
12892
- },
12893
- "51868": {
12894
- "content": "हह",
12895
- "lstrip": false,
12896
- "normalized": false,
12897
- "rstrip": false,
12898
- "single_word": false,
12899
- "special": true
12900
- },
12901
- "51869": {
12902
- "content": "उह",
12903
- "lstrip": false,
12904
- "normalized": false,
12905
- "rstrip": false,
12906
- "single_word": false,
12907
- "special": true
12908
- },
12909
- "51870": {
12910
- "content": "हुह्ह",
12911
- "lstrip": false,
12912
- "normalized": false,
12913
- "rstrip": false,
12914
- "single_word": false,
12915
- "special": true
12916
- },
12917
- "51871": {
12918
- "content": "ओ",
12919
- "lstrip": false,
12920
- "normalized": false,
12921
- "rstrip": false,
12922
- "single_word": false,
12923
- "special": true
12924
- },
12925
- "51872": {
12926
- "content": "अम्म",
12927
- "lstrip": false,
12928
- "normalized": false,
12929
- "rstrip": false,
12930
- "single_word": false,
12931
- "special": true
12932
- },
12933
- "51873": {
12934
- "content": "हाहा",
12935
- "lstrip": false,
12936
- "normalized": false,
12937
- "rstrip": false,
12938
- "single_word": false,
12939
- "special": true
12940
- },
12941
- "51874": {
12942
- "content": "हुंह",
12943
- "lstrip": false,
12944
- "normalized": false,
12945
- "rstrip": false,
12946
- "single_word": false,
12947
- "special": true
12948
- },
12949
- "51875": {
12950
- "content": "ह ह ह ह",
12951
- "lstrip": false,
12952
- "normalized": false,
12953
- "rstrip": false,
12954
- "single_word": false,
12955
- "special": true
12956
- },
12957
- "51876": {
12958
- "content": "ओह",
12959
- "lstrip": false,
12960
- "normalized": false,
12961
- "rstrip": false,
12962
- "single_word": false,
12963
- "special": true
12964
- },
12965
- "51877": {
12966
- "content": "ह",
12967
- "lstrip": false,
12968
- "normalized": false,
12969
- "rstrip": false,
12970
- "single_word": false,
12971
- "special": true
12972
- },
12973
- "51878": {
12974
- "content": "अ अ",
12975
- "lstrip": false,
12976
- "normalized": false,
12977
- "rstrip": false,
12978
- "single_word": false,
12979
- "special": true
12980
- },
12981
- "51879": {
12982
- "content": "हूं हूं",
12983
- "lstrip": false,
12984
- "normalized": false,
12985
- "rstrip": false,
12986
- "single_word": false,
12987
- "special": true
12988
- },
12989
- "51880": {
12990
- "content": "अहं",
12991
- "lstrip": false,
12992
- "normalized": false,
12993
- "rstrip": false,
12994
- "single_word": false,
12995
- "special": true
12996
- },
12997
- "51881": {
12998
- "content": "ह ह",
12999
- "lstrip": false,
13000
- "normalized": false,
13001
- "rstrip": false,
13002
- "single_word": false,
13003
- "special": true
13004
- },
13005
- "51882": {
13006
- "content": "आंह",
13007
- "lstrip": false,
13008
- "normalized": false,
13009
- "rstrip": false,
13010
- "single_word": false,
13011
- "special": true
13012
- },
13013
- "51883": {
13014
- "content": "हहह",
13015
- "lstrip": false,
13016
- "normalized": false,
13017
- "rstrip": false,
13018
- "single_word": false,
13019
- "special": true
13020
- },
13021
- "51884": {
13022
- "content": "उ",
13023
- "lstrip": false,
13024
- "normalized": false,
13025
- "rstrip": false,
13026
- "single_word": false,
13027
- "special": true
13028
- },
13029
- "51885": {
13030
- "content": "ह ह ह",
13031
- "lstrip": false,
13032
- "normalized": false,
13033
- "rstrip": false,
13034
- "single_word": false,
13035
- "special": true
13036
- },
13037
- "51886": {
13038
- "content": "अन्न",
13039
- "lstrip": false,
13040
- "normalized": false,
13041
- "rstrip": false,
13042
- "single_word": false,
13043
- "special": true
13044
- },
13045
- "51887": {
13046
- "content": "हुह",
13047
- "lstrip": false,
13048
- "normalized": false,
13049
- "rstrip": false,
13050
- "single_word": false,
13051
- "special": true
13052
- },
13053
- "51888": {
13054
- "content": "उम्म",
13055
- "lstrip": false,
13056
- "normalized": false,
13057
- "rstrip": false,
13058
- "single_word": false,
13059
- "special": true
13060
- },
13061
- "51889": {
13062
- "content": "अह",
13063
- "lstrip": false,
13064
- "normalized": false,
13065
- "rstrip": false,
13066
- "single_word": false,
13067
- "special": true
13068
- },
13069
- "51890": {
13070
- "content": "हम्म",
13071
- "lstrip": false,
13072
- "normalized": false,
13073
- "rstrip": false,
13074
- "single_word": false,
13075
- "special": true
13076
- },
13077
- "51891": {
13078
- "content": "आं",
13079
- "lstrip": false,
13080
- "normalized": false,
13081
- "rstrip": false,
13082
- "single_word": false,
13083
- "special": true
13084
- },
13085
- "51892": {
13086
- "content": "हुं हुं",
13087
- "lstrip": false,
13088
- "normalized": false,
13089
- "rstrip": false,
13090
- "single_word": false,
13091
- "special": true
13092
- },
13093
- "51893": {
13094
- "content": "अं",
13095
- "lstrip": false,
13096
- "normalized": false,
13097
- "rstrip": false,
13098
- "single_word": false,
13099
- "special": true
13100
  }
13101
  },
13102
  "additional_special_tokens": [
13103
- "अम्",
13104
- "हां",
13105
- "आँ",
13106
- "हह",
13107
- "उह",
13108
- "हुह्ह",
13109
- "ओ",
13110
- "अम्म",
13111
- "हाहा",
13112
- "हुंह",
13113
- "ह ह ह ह",
13114
- "ओह",
13115
- "ह",
13116
- "अ अ",
13117
- "हूं हूं",
13118
- "अहं",
13119
- "ह ह",
13120
- "आंह",
13121
- "हहह",
13122
- "उ",
13123
- "ह ह ह",
13124
- "अन्न",
13125
- "हुह",
13126
- "उम्म",
13127
- "अह",
13128
- "हम्म",
13129
- "आं",
13130
- "हुं हुं",
13131
- "अं"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13132
  ],
13133
  "bos_token": "<|endoftext|>",
13134
  "clean_up_tokenization_spaces": true,
 
12865
  "rstrip": false,
12866
  "single_word": false,
12867
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12868
  }
12869
  },
12870
  "additional_special_tokens": [
12871
+ "<|endoftext|>",
12872
+ "<|startoftranscript|>",
12873
+ "<|en|>",
12874
+ "<|zh|>",
12875
+ "<|de|>",
12876
+ "<|es|>",
12877
+ "<|ru|>",
12878
+ "<|ko|>",
12879
+ "<|fr|>",
12880
+ "<|ja|>",
12881
+ "<|pt|>",
12882
+ "<|tr|>",
12883
+ "<|pl|>",
12884
+ "<|ca|>",
12885
+ "<|nl|>",
12886
+ "<|ar|>",
12887
+ "<|sv|>",
12888
+ "<|it|>",
12889
+ "<|id|>",
12890
+ "<|hi|>",
12891
+ "<|fi|>",
12892
+ "<|vi|>",
12893
+ "<|he|>",
12894
+ "<|uk|>",
12895
+ "<|el|>",
12896
+ "<|ms|>",
12897
+ "<|cs|>",
12898
+ "<|ro|>",
12899
+ "<|da|>",
12900
+ "<|hu|>",
12901
+ "<|ta|>",
12902
+ "<|no|>",
12903
+ "<|th|>",
12904
+ "<|ur|>",
12905
+ "<|hr|>",
12906
+ "<|bg|>",
12907
+ "<|lt|>",
12908
+ "<|la|>",
12909
+ "<|mi|>",
12910
+ "<|ml|>",
12911
+ "<|cy|>",
12912
+ "<|sk|>",
12913
+ "<|te|>",
12914
+ "<|fa|>",
12915
+ "<|lv|>",
12916
+ "<|bn|>",
12917
+ "<|sr|>",
12918
+ "<|az|>",
12919
+ "<|sl|>",
12920
+ "<|kn|>",
12921
+ "<|et|>",
12922
+ "<|mk|>",
12923
+ "<|br|>",
12924
+ "<|eu|>",
12925
+ "<|is|>",
12926
+ "<|hy|>",
12927
+ "<|ne|>",
12928
+ "<|mn|>",
12929
+ "<|bs|>",
12930
+ "<|kk|>",
12931
+ "<|sq|>",
12932
+ "<|sw|>",
12933
+ "<|gl|>",
12934
+ "<|mr|>",
12935
+ "<|pa|>",
12936
+ "<|si|>",
12937
+ "<|km|>",
12938
+ "<|sn|>",
12939
+ "<|yo|>",
12940
+ "<|so|>",
12941
+ "<|af|>",
12942
+ "<|oc|>",
12943
+ "<|ka|>",
12944
+ "<|be|>",
12945
+ "<|tg|>",
12946
+ "<|sd|>",
12947
+ "<|gu|>",
12948
+ "<|am|>",
12949
+ "<|yi|>",
12950
+ "<|lo|>",
12951
+ "<|uz|>",
12952
+ "<|fo|>",
12953
+ "<|ht|>",
12954
+ "<|ps|>",
12955
+ "<|tk|>",
12956
+ "<|nn|>",
12957
+ "<|mt|>",
12958
+ "<|sa|>",
12959
+ "<|lb|>",
12960
+ "<|my|>",
12961
+ "<|bo|>",
12962
+ "<|tl|>",
12963
+ "<|mg|>",
12964
+ "<|as|>",
12965
+ "<|tt|>",
12966
+ "<|haw|>",
12967
+ "<|ln|>",
12968
+ "<|ha|>",
12969
+ "<|ba|>",
12970
+ "<|jw|>",
12971
+ "<|su|>",
12972
+ "<|translate|>",
12973
+ "<|transcribe|>",
12974
+ "<|startoflm|>",
12975
+ "<|startofprev|>",
12976
+ "<|nocaptions|>",
12977
+ "<|notimestamps|>"
12978
  ],
12979
  "bos_token": "<|endoftext|>",
12980
  "clean_up_tokenization_spaces": true,