{
  "model_name": "Efficient Conformer CTC Small",
  "model_type": "CTC",
  "encoder_params": {
    "arch": "Conformer",
    "num_blocks": 15,
    "dim_model": [120, 168, 240],
    "ff_ratio": 4,
    "num_heads": 4,
    "kernel_size": 15,
    "Pdrop": 0.1,
    "conv_stride": 2,
    "att_stride": 1,
    "strided_blocks": [4, 9],
    "expand_blocks": [4, 9],
    "att_group_size": [3, 1, 1],
    "relative_pos_enc": true,
    "max_pos_encoding": 10000,
    "subsampling_module": "Conv2d",
    "subsampling_layers": 1,
    "subsampling_filters": [120],
    "subsampling_kernel_size": 3,
    "subsampling_norm": "batch",
    "subsampling_act": "swish",
    "sample_rate": 16000,
    "win_length_ms": 25,
    "hop_length_ms": 10,
    "n_fft": 512,
    "n_mels": 80,
    "normalize": false,
    "mean": -5.6501,
    "std": 4.2280,
    "spec_augment": false,
    "mF": 2,
    "F": 27,
    "mT": 5,
    "pS": 0.05
  },
  "tokenizer_params": {
    "tokenizer_path": "datasets/Vietnamese/vi_bpe_1024.model",
    "vocab_type": "bpe",
    "vocab_size": 1024
  },
  "training_params": {
    "epochs": 450,
    "batch_size": 64,
    "accumulated_steps": 2,
    "mixed_precision": true,
    "optimizer": "Adam",
    "beta1": 0.9,
    "beta2": 0.98,
    "eps": 1e-9,
    "weight_decay": 1e-6,
    "lr_schedule": "Transformer",
    "schedule_dim": 240,
    "warmup_steps": 10000,
    "K": 2,
    "train_audio_max_length": 256000,
    "train_label_max_length": 256000,
    "eval_audio_max_length": 256000,
    "eval_label_max_length": 256000,
    "training_dataset": "Vietnamese",
    "training_dataset_path": "/mnt/c/Users/hyngu/Data/ASRDataset/",
    "evaluation_dataset": "Vietnamese",
    "evaluation_dataset_path": "/mnt/c/Users/hyngu/Data/ASRDataset/",
    "callback_path": "callbacks/EfficientConformerCTCSmall/"
  },
  "decoding_params": {
    "beam_size": 15,
    "tmp": 1,
    "ngram_path": "data/6gram_lm_corpus.binary",
    "ngram_alpha": 0.4,
    "ngram_beta": 1.0,
    "lm_config": "configs/LM-Transformer.json",
    "lm_weight": 1,
    "lm_tmp": 1
  }
}