Kaspar Beelen committed
Commit: cd57bf1
Parent(s): ad792cc

add tokenizer
Files changed:
- added_tokens.json (+84, -0)
- special_tokens_map.json (+13, -0)
- tokenizer.json (+0, -0)
- tokenizer_config.json (+14, -0)
- vocab.txt (+0, -0)
added_tokens.json ADDED
@@ -0,0 +1,84 @@
+{
+  "[1801]": 30579,
+  "[1802]": 30591,
+  "[1803]": 30577,
+  "[1804]": 30589,
+  "[1805]": 30596,
+  "[1806]": 30547,
+  "[1807]": 30532,
+  "[1808]": 30555,
+  "[1809]": 30530,
+  "[1810]": 30593,
+  "[1811]": 30584,
+  "[1812]": 30581,
+  "[1813]": 30553,
+  "[1814]": 30573,
+  "[1815]": 30536,
+  "[1816]": 30568,
+  "[1817]": 30587,
+  "[1818]": 30570,
+  "[1819]": 30586,
+  "[1820]": 30578,
+  "[1821]": 30597,
+  "[1822]": 30557,
+  "[1823]": 30561,
+  "[1824]": 30566,
+  "[1825]": 30569,
+  "[1826]": 30595,
+  "[1827]": 30580,
+  "[1828]": 30594,
+  "[1829]": 30582,
+  "[1830]": 30583,
+  "[1831]": 30534,
+  "[1832]": 30588,
+  "[1833]": 30590,
+  "[1834]": 30539,
+  "[1835]": 30565,
+  "[1836]": 30567,
+  "[1837]": 30549,
+  "[1838]": 30585,
+  "[1839]": 30592,
+  "[1840]": 30562,
+  "[1841]": 30541,
+  "[1842]": 30575,
+  "[1843]": 30598,
+  "[1844]": 30552,
+  "[1845]": 30554,
+  "[1846]": 30544,
+  "[1847]": 30558,
+  "[1848]": 30533,
+  "[1849]": 30531,
+  "[1850]": 30543,
+  "[1851]": 30559,
+  "[1852]": 30550,
+  "[1853]": 30551,
+  "[1854]": 30556,
+  "[1855]": 30542,
+  "[1856]": 30548,
+  "[1857]": 30563,
+  "[1858]": 30571,
+  "[1859]": 30529,
+  "[1860]": 30564,
+  "[1861]": 30538,
+  "[1862]": 30537,
+  "[1863]": 30546,
+  "[1864]": 30572,
+  "[1865]": 30535,
+  "[1866]": 30545,
+  "[1867]": 30560,
+  "[1868]": 30540,
+  "[1869]": 30576,
+  "[1870]": 30574,
+  "[1871]": 30599,
+  "[LOC]": 30603,
+  "[MET]": 30600,
+  "[POL]": 30602,
+  "[YEAR]": 30601,
+  "[con]": 30523,
+  "[lib]": 30522,
+  "[liverpool]": 30528,
+  "[london]": 30527,
+  "[neutr]": 30526,
+  "[none]": 30524,
+  "[rad]": 30525
+}
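This file maps 82 new tokens to vocabulary IDs 30522-30603: one token per year from [1801] to [1871], political-leaning labels ([lib], [con], [rad], [neutr], [none]), place labels ([london], [liverpool]), and the four field markers [MET], [YEAR], [POL], [LOC]. A minimal sketch of how these tokens behave once the files from this commit are loaded; the local path ./erwt-year-st is hypothetical, standing in for wherever the files are saved:

    from transformers import AutoTokenizer

    # Hypothetical local directory holding the files added in this commit.
    tokenizer = AutoTokenizer.from_pretrained("./erwt-year-st")

    # Added tokens are matched as single units rather than split into WordPieces.
    print(tokenizer.tokenize("[YEAR] [1850] the great exhibition opened"))

    # IDs come straight from added_tokens.json.
    print(tokenizer.convert_tokens_to_ids("[1850]"))  # 30543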
special_tokens_map.json ADDED
@@ -0,0 +1,13 @@
+{
+  "additional_special_tokens": [
+    "[MET]",
+    "[YEAR]",
+    "[POL]",
+    "[LOC]"
+  ],
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
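This file registers the four metadata markers as additional special tokens alongside the standard BERT-style specials, which protects them from lowercasing and splitting. A quick check, under the same hypothetical path assumption as above:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./erwt-year-st")  # hypothetical path
    print(tokenizer.additional_special_tokens)  # ['[MET]', '[YEAR]', '[POL]', '[LOC]']
    print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.mask_token)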
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
+{
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "name_or_path": "erwt-year-st",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "DistilBertTokenizer",
+  "unk_token": "[UNK]"
+}
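The config pins the tokenizer class to DistilBertTokenizer with lowercasing enabled and a 512-token limit, matching DistilBERT's maximum sequence length. A sketch of the limit in action, same hypothetical path:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./erwt-year-st")  # hypothetical path
    print(tokenizer.model_max_length)  # 512

    # Inputs longer than model_max_length are cut off when truncation is on.
    ids = tokenizer("word " * 1000, truncation=True)["input_ids"]
    print(len(ids))  # 512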
vocab.txt ADDED
The diff for this file is too large to render. See raw diff.
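Since this commit only adds tokenizer files, any DistilBERT checkpoint paired with it needs embedding rows for the 82 new IDs. A sketch assuming distilbert-base-uncased as the base model (an assumption; the commit itself does not name one):

    from transformers import AutoModelForMaskedLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("./erwt-year-st")              # hypothetical path
    model = AutoModelForMaskedLM.from_pretrained("distilbert-base-uncased")  # assumed base

    # The base vocab has 30522 WordPieces; the added tokens occupy IDs
    # 30522-30603, so the embedding matrix must grow to len(tokenizer) rows.
    model.resize_token_embeddings(len(tokenizer))
    print(len(tokenizer))  # 30604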