File size: 2,373 Bytes
9dca845
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b516cba
9dca845
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
{
    "model_name": "Efficient Conformer CTC Small",
    "model_type": "CTC",

    "encoder_params": 
    {
        "arch": "Conformer",
        "num_blocks": 15,
        "dim_model": [120, 168, 240],
        "ff_ratio": 4,
        "num_heads": 4,
        "kernel_size": 15,
        "Pdrop": 0.1,
        "conv_stride": 2,
        "att_stride": 1,
        "strided_blocks": [4, 9],
        "expand_blocks": [4, 9],
        "att_group_size": [3, 1, 1],

        "relative_pos_enc": true,
        "max_pos_encoding": 10000,

        "subsampling_module": "Conv2d",
        "subsampling_layers": 1,
        "subsampling_filters": [120],
        "subsampling_kernel_size": 3,
        "subsampling_norm": "batch",
        "subsampling_act": "swish",

        "sample_rate": 16000,
        "win_length_ms": 25,
        "hop_length_ms": 10,
        "n_fft": 512,
        "n_mels": 80,
        "normalize": false,
        "mean": -5.6501,
        "std": 4.2280,

        "spec_augment": false,
        "mF": 2,
        "F": 27,
        "mT": 5,
        "pS": 0.05
    },
    
    "tokenizer_params":
    {
        "tokenizer_path": "datasets/Vietnamese/vi_bpe_1024.model",
        "vocab_type": "bpe",
        "vocab_size": 1024
    },

    "training_params":
    {
        "epochs": 450,
        "batch_size": 64,
        "accumulated_steps": 2,
        "mixed_precision": true,

        "optimizer": "Adam",
        "beta1": 0.9,
        "beta2": 0.98,
        "eps": 1e-9,
        "weight_decay": 1e-6,

        "lr_schedule": "Transformer",
        "schedule_dim": 240,
        "warmup_steps": 10000,
        "K": 2,

        "train_audio_max_length": 256000,
        "train_label_max_length": 256000,
        "eval_audio_max_length": 256000,
        "eval_label_max_length": 256000,

        "training_dataset": "Vietnamese",
        "training_dataset_path": "/mnt/c/Users/hyngu/Data/ASRDataset/",

        "evaluation_dataset": "Vietnamese",
        "evaluation_dataset_path": "/mnt/c/Users/hyngu/Data/ASRDataset/",

        "callback_path": "callbacks/EfficientConformerCTCSmall/"
    },

    "decoding_params":
    {
        "beam_size": 15,
        "tmp": 1,

        "ngram_path": "data/6gram_lm_corpus.binary",
        "ngram_alpha": 0.4,
        "ngram_beta": 1.0,

        "lm_config": "configs/LM-Transformer.json",
        "lm_weight": 1,
        "lm_tmp": 1
    }
}