# Generated 2025-08-16 from:
# /home/adelmou/proj/speechbrain/speechllm_librispeech/speechbrain/recipes/LibriSpeech/ASR/transformer/hparams/llama.yaml
# yamllint disable
# ############################################################################
# Authors: Adel Moumen
# ############################################################################
#
# Resolved hyperparameter dump for a LibriSpeech ASR run. Every value below is
# a literal (paths and numbers are already expanded), so changing one entry
# does not propagate to the other places where the same value is repeated.

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 3407
__set_seed: !apply:speechbrain.utils.seed_everything [3407]

# NOTE(review): the experiment name mentions llama_3.2_1b while every path
# below points to wavlm_large + SmolLM2-1.7B -- confirm the name is intended.
experiment_name: llama_3.2_1b_ASR
output_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora/
output_wer_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//wer_results
save_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
train_log: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt

# Data files
data_folder: /localscratch/adelmou.48174029.0/LibriSpeech/  # e.g., /path/to/LibriSpeech
# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES
# then data_folder_rirs should be /localscratch/xxx_corpus
# otherwise the dataset will automatically be downloaded
# data_folder_rirs: !ref
train_splits: [train-clean-100, train-clean-360, train-other-500]  # , "train-clean-360", "train-other-500"
dev_splits: [dev-clean]
test_splits: [test-clean, test-other]
skip_prep: false
train_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train.csv
valid_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//dev-clean.csv
test_csv:
  - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-clean.csv
  - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-other.csv

ckpt_interval_minutes: 15  # save checkpoint every N min

####################### Training Parameters ####################################

# URL for the HuggingFace model we want to load (BASE here)
# NOTE(review): the value is a local path ending in wavlm-large, not a URL to
# a BASE model -- the comment above looks stale.
ssl_hub: /localscratch/adelmou.48174029.0/wavlm-large/
ssl_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
ssl_frozen: true

# LLM options
llm_path: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
llm_emb_size: 2048

number_of_epochs: 20
batch_size: 32  # Only used if dynamic batching is off.
# 400s * 10 => 1h / opt step
grad_accumulation_factor: 5
loss_reduction: batchmean
sorting: random
num_workers: 4
precision: bf16  # bf16, fp16 or fp32
eval_precision: bf16
max_grad_norm: 1.0

# stages related parameters
initial_lr: 0.0005
final_lr: 0.00001
lr_wav2vec: 0.00002
weight_decay: 0.0
warmup_steps: 5000
augment_warmup: 7500

# BPE parameters
token_type: unigram  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Feature parameters
sample_rate: 16000
downsampling_factor: 5  # Used to downsample frames before llm projection.

# This setup works well for an A100 80GB GPU; adapt it to your needs.
# Or turn it off (but training speed will decrease)
dynamic_batching: true
max_batch_length_train: 300
max_batch_length_val: 100  # we reduce it as the beam is much wider (VRAM)
num_bucket: 200
shuffle: true  # if true re-creates batches at each epoch shuffling examples.
# Remaining dynamic-batching hyperparameters (top-level scalars).
batch_ordering: random
max_batch_ex: 256

# Sampler settings for training batches.
dynamic_batch_sampler_train:
  max_batch_length: 300
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Sampler settings for validation batches (shorter max_batch_length).
dynamic_batch_sampler_valid:
  max_batch_length: 100
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Dataloader options
# In all three dataloaders the default padding value (49152) equals pad_token
# defined further down; `sig` and `tokens_eos` override it per key.
# NOTE(review): -100 for tokens_eos is presumably the loss ignore index --
# confirm against the training script.
train_dataloader_opts:
  batch_size: 32
  shuffle: true
  num_workers: 4
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100

valid_dataloader_opts:
  batch_size: 8
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100

test_dataloader_opts:
  batch_size: 8
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100

####################### Model Parameters ###########################
activation: &id001 !name:speechbrain.nnet.activations.Swish
asr_output_neurons: 1024
lora_rank: 16

# Frames - LLM projector params
dnn_layers: 2
dnn_neurons: 1024
downsampling_output_dim: 5120

avg_checkpoints: 3

# Outputs
blank_index: 0
# Original note: "Llama 3 pad index after adding: BEURK."
# NOTE(review): llm_path in this file points to SmolLM2-1.7B, not Llama 3 --
# confirm this index is correct for the tokenizer actually used.
pad_token: 49152

# Decoding parameters
valid_search_interval: 4
valid_beam_size: 1  # We do greedy here so it's faster to decode ...
test_beam_size: 5

############################## models ################################

# Input normalization object, shared through the &id008 anchor.
normalize: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence

#wav2vec model
# Loaded from the local wavlm-large directory with freeze: true.
ssl: &id003 !new:speechbrain.integrations.huggingface.wav2vec2.Wav2Vec2
  source: /localscratch/adelmou.48174029.0/wavlm-large/
  output_norm: true
  freeze: true
  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
  device_map: cuda
  # attn_implementation: sdpa
  # normalize_wav: False

# Linear map from 5120 features to 2048 features, without bias.
linear_projection: &id007 !new:torch.nn.Linear
  in_features: 5120
  out_features: 2048
  bias: false

# NOTE(review): the "5 x 1024" remark equals 5120 (the linear_projection input
# size), while input_shape here is 2048 -- the remark looks stale; confirm.
gated_nn: &id006 !new:speechbrain.lobes.models.GatedNN.GatedNN
  input_shape: [null, null, 2048]  # 5 x 1024
  activation: *id001
  blocks: 2
  neurons: 1024

# NOTE(review): placement reconstructed from a flattened source. This key is
# assumed to be a top-level hyperparameter (mirrored inside backbone_llm
# below) rather than a GatedNN argument -- confirm against the recipe.
output_hidden_states: false

# Frozen LLM backbone; two extra special tokens delimit the audio segment.
backbone_llm: &id002 !new:speechbrain.integrations.huggingface.llama.LLaMA
  source: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
  freeze: true
  attn_implementation: flash_attention_2
  device: cuda
  torch_dtype: !name:torch.bfloat16
  additional_special_tokens:
    - <|start_of_audio|>
    - <|end_of_audio|>
  output_hidden_states: false

# Simply uncomment if you want to use LoRA adaptation.
# NOTE(review): the LoRA-wrapped `llm` entry that follows is already active
# (not commented out), so the sentence above appears outdated.
# LLM wrapped with LoRA adapters (rank 16); all_linear is set to true.
llm: &id005 !new:speechbrain.nnet.adapters.AdaptedModel
  model_to_adapt: *id002
  adapter_class: !name:speechbrain.nnet.adapters.LoRA
  all_linear: true
  adapter_kwargs:
    rank: 16

# Downsampler configured with a downsampling factor of 5.
feat_downsampler: &id004 !new:speechbrain.lobes.downsampling.ConcatDownsampler
  downsampling_factor: 5

# Name -> object mapping; every value aliases an object defined earlier.
modules:
  ssl: *id003
  feat_downsampler: *id004
  llm: *id005
  gated_nn: *id006
  linear_projection: *id007
  normalize: *id008

# Single positional argument: the list [ssl, llm, gated_nn, linear_projection].
# This ModuleList is what the checkpointer saves under the key `model`.
model: &id009 !new:torch.nn.ModuleList
  - [*id003, *id005, *id006, *id007]

# Optimizer factories (!name: binds the arguments without instantiating).
Adam: !name:torch.optim.AdamW
  lr: 0.0005
  weight_decay: 0.0

Adam_wav2vec2: !name:torch.optim.AdamW
  lr: 0.00002
  weight_decay: 0.0

# Schedule from 0.0005 to 0.00001 over 20 epochs.
scheduler: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.0005
  final_value: 0.00001
  epoch_count: 20

lr_annealing_wav2vec: &id010 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.00002
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 1

# Checkpointing: the epoch counter is created inline (&id011) and re-exposed
# as the top-level `epoch_counter` key right after.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
  recoverables:
    model: *id009
    lr_annealing_wav2vec: *id010
    counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
      limit: 20
    normalize: *id008

epoch_counter: *id011

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt

# Error-rate metric factories; cer_computer additionally sets split_tokens.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats