{
    "sources": {
        "c4": {
            "provider": "huggingface",
            "partition": "en",
            "split": "train",
            "streaming": true,
            "remove_columns": null,
            "concatenate_successive_entries": 0
        }
    },
    "name": "c4-subset",
    "normalizer": {
        "force_lowercase": true,
        "strip_accents": true,
        "force_english_keyboard": true,
        "whitespace_escape": false
    },
    "tokenizer": "WordPiece",
    "vocab_size": 32768,
    "seq_length": 128,
    "include_cls_token_in_corpus": false,
    "include_sep_token_in_corpus": true,
    "use_type_ids": false,
    "max_entries_in_raw_dataset": 25000000.0,
    "max_seq_in_tokenized_dataset": 85000000.0,
    "named_entity_simplification": false,
    "remove_whitespaces": false,
    "remove_trash": true,
    "trash_cutoff": 0.25,
    "deduplicate_entries": true,
    "deduplication_threshold": 75,
    "ordering": "sentence-length-curriculum"
}