# Generated 2025-08-16 from:
# /home/adelmou/proj/speechbrain/speechllm_librispeech/speechbrain/recipes/LibriSpeech/ASR/transformer/hparams/llama.yaml
# yamllint disable
# ############################################################################
# Authors:  Adel Moumen
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
# Random seed; `__set_seed` passes the same value to seed_everything.
seed: 3407
__set_seed: !apply:speechbrain.utils.seed_everything [3407]
# NOTE(review): this name mentions Llama 3.2 1B, while the paths in this file
# point at WavLM-large + SmolLM2-1.7B -- confirm the label is intended.
experiment_name: llama_3.2_1b_ASR
# Root directory of the run, followed by the artefacts stored beneath it.
output_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora/
output_wer_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//wer_results
save_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
train_log: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt


# Data files
# Root of the LibriSpeech corpus, e.g., /path/to/LibriSpeech
data_folder: /localscratch/adelmou.48174029.0/LibriSpeech/
# If a RIRS_NOISES directory exists in /localscratch/xxx_corpus/RIRS_NOISES,
# then data_folder_rirs should be /localscratch/xxx_corpus;
# otherwise the dataset will be downloaded automatically.
# data_folder_rirs: !ref <data_folder>
train_splits:
- train-clean-100
- train-clean-360
- train-other-500
dev_splits:
- dev-clean
test_splits:
- test-clean
- test-other
skip_prep: false
# CSV manifests, all located under the output folder.
train_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train.csv
valid_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//dev-clean.csv
test_csv:
- /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-clean.csv
- /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-other.csv

# Save a checkpoint every N minutes.
ckpt_interval_minutes: 15

####################### Training Parameters ####################################

# Location of the pretrained SSL model to load. In this run it is a local
# directory (wavlm-large), not a HuggingFace hub URL.
ssl_hub: /localscratch/adelmou.48174029.0/wavlm-large/
ssl_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
ssl_frozen: true

# LLM options (local SmolLM2-1.7B directory; 2048 matches the out_features of
# linear_projection).
llm_path: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
llm_emb_size: 2048

number_of_epochs: 20
# Only used if dynamic batching is off.
batch_size: 32
# NOTE(review): the earlier estimate "400s * 10 => 1h / opt step" does not
# match the current settings (max_batch_length_train 300, factor 5) -- confirm.
grad_accumulation_factor: 5
loss_reduction: batchmean
sorting: random
num_workers: 4
# bf16, fp16 or fp32
precision: bf16
eval_precision: bf16
max_grad_norm: 1.0

# Learning-rate parameters. The same values appear in the objects further
# down: initial_lr in `Adam` and `scheduler`, final_lr in `scheduler`, and
# lr_wav2vec in `Adam_wav2vec2` and `lr_annealing_wav2vec`.
initial_lr: 0.0005
final_lr: 0.00001
lr_wav2vec: 0.00002

# weight_decay is the value given to both AdamW optimizers.
weight_decay: 0.0
warmup_steps: 5000
augment_warmup: 7500

# BPE parameters
token_type: unigram  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Feature parameters
sample_rate: 16000
# Used to downsample frames before the LLM projection; same value as the
# downsampling_factor of `feat_downsampler`.
downsampling_factor: 5

# This setup works well for an A100 80GB GPU; adapt it to your needs.
# Or turn it off (but training speed will decrease).
dynamic_batching: true
max_batch_length_train: 300
# Reduced relative to training; the original rationale was a wider beam (VRAM).
# NOTE(review): valid_beam_size is 1 in this file, so that rationale may be
# outdated -- confirm.
max_batch_length_val: 100
num_bucket: 200
shuffle: true # if true re-creates batches at each epoch shuffling examples.
batch_ordering: random
max_batch_ex: 256

# Sampler options for training; the values repeat the scalars above
# (max_batch_length_train, num_bucket, shuffle, batch_ordering, max_batch_ex).
dynamic_batch_sampler_train:
  max_batch_length: 300
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Sampler options for validation; identical to the training ones except that
# max_batch_length takes the max_batch_length_val value.
dynamic_batch_sampler_valid:
  max_batch_length: 100
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256

# Dataloader options
# Training loader options; batch_size and num_workers repeat the scalar
# settings defined earlier in the file.
train_dataloader_opts:
  batch_size: 32
  shuffle: true
  num_workers: 4
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    # Default padding value; same number as pad_token.
    padding_kwargs:
      value: 49152
    # Per-key overrides: `sig` is padded with 0 and `tokens_eos` with -100
    # (presumably so padded targets are ignored by the loss -- confirm).
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100

# Validation loader options; same collate settings as training.
valid_dataloader_opts:
  batch_size: 8
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100

# Test loader options; identical to the validation ones.
test_dataloader_opts:
  batch_size: 8
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100


####################### Model Parameters ###########################
# Activation class; the &id001 anchor is reused as the activation of gated_nn.
activation: &id001 !name:speechbrain.nnet.activations.Swish
asr_output_neurons: 1024
# Same value as the LoRA `rank` under `llm`.
lora_rank: 16

# Frames - LLM projector params
# dnn_layers / dnn_neurons repeat the `blocks` / `neurons` of gated_nn;
# downsampling_output_dim repeats the `in_features` of linear_projection.
dnn_layers: 2
dnn_neurons: 1024
downsampling_output_dim: 5120
avg_checkpoints: 3

# Outputs
blank_index: 0
# Pad index after adding the pad token; the same number is the default padding
# value in the dataloader options.
# NOTE(review): the previous comment called this the "Llama 3" pad index, but
# llm_path points at SmolLM2-1.7B -- confirm the index for that tokenizer.
pad_token: 49152

# Decoding parameters
valid_search_interval: 4
valid_beam_size: 1 # We do greedy here so it's faster to decode ...
test_beam_size: 5

############################## models ################################

# Input normalization object, configured with norm_type "sentence".
normalize: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence

# SSL encoder: the wavlm-large directory loaded through the Wav2Vec2 wrapper
# class, with freeze set to true.
ssl: &id003 !new:speechbrain.integrations.huggingface.wav2vec2.Wav2Vec2
  source: /localscratch/adelmou.48174029.0/wavlm-large/
  output_norm: true
  freeze: true
  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
  device_map: cuda
  # Optional settings, currently disabled:
  # attn_implementation: sdpa
  # normalize_wav: False

# Bias-free linear map from 5120 features (downsampling_output_dim) to 2048
# features (llm_emb_size).
linear_projection: &id007 !new:torch.nn.Linear
  in_features: 5120
  out_features: 2048
  bias: false

# Gated network with 2 blocks of 1024 neurons and the Swish activation.
# NOTE(review): the previous comment on input_shape read "5 x 1024" (= 5120),
# which does not match the configured last dimension of 2048 -- confirm which
# one the training script expects.
gated_nn: &id006 !new:speechbrain.lobes.models.GatedNN.GatedNN
  input_shape:
  - null
  - null
  - 2048
  activation: *id001
  blocks: 2
  neurons: 1024

output_hidden_states: false
# Backbone LLM loaded from the local SmolLM2-1.7B directory through the LLaMA
# wrapper class; frozen, bfloat16, with two extra audio boundary tokens.
backbone_llm: &id002 !new:speechbrain.integrations.huggingface.llama.LLaMA
  source: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
  freeze: true
  attn_implementation: flash_attention_2
  device: cuda
  torch_dtype: !name:torch.bfloat16
  additional_special_tokens:
  - <|start_of_audio|>
  - <|end_of_audio|>
  output_hidden_states: false

# LoRA adaptation of the backbone LLM (enabled in this configuration):
# rank-16 adapters with all_linear set to true.
llm: &id005 !new:speechbrain.nnet.adapters.AdaptedModel
  model_to_adapt: *id002
  adapter_class: !name:speechbrain.nnet.adapters.LoRA
  all_linear: true
  adapter_kwargs:
    rank: 16

# Concatenation-based frame downsampler; the factor repeats the
# downsampling_factor scalar defined earlier.
feat_downsampler: &id004 !new:speechbrain.lobes.downsampling.ConcatDownsampler
  downsampling_factor: 5

# Name -> object map of every module defined above (presumably consumed by the
# training script -- confirm).
modules:
  ssl: *id003
  feat_downsampler: *id004
  llm: *id005
  gated_nn: *id006
  linear_projection: *id007
  normalize: *id008
# ModuleList over ssl, llm, gated_nn and linear_projection; this is the object
# listed as `model` in the checkpointer recoverables. feat_downsampler and
# normalize are not part of it.
model: &id009 !new:torch.nn.ModuleList
- [*id003, *id005, *id006, *id007]
# Optimizer factory. Despite the key name, the class is AdamW; lr has the
# initial_lr value.
Adam: !name:torch.optim.AdamW
  lr: 0.0005
  weight_decay: 0.0

# AdamW factory with the lr_wav2vec value.
# NOTE(review): `ssl` is configured with freeze: true, so this optimizer may be
# unused -- confirm in the training script.
Adam_wav2vec2: !name:torch.optim.AdamW
  lr: 0.00002
  weight_decay: 0.0


# Linear schedule from the initial_lr value to the final_lr value, with
# epoch_count equal to number_of_epochs.
scheduler: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.0005
  final_value: 0.00001
  epoch_count: 20

# NewBob schedule starting at the lr_wav2vec value; it is stored by the
# checkpointer through the &id010 anchor.
lr_annealing_wav2vec: &id010 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.00002
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 1


# Checkpointer writing to the save folder. Recoverables: the model ModuleList,
# the wav2vec LR annealing scheduler, the epoch counter and the normalizer.
# The linear `scheduler` is not listed among them.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
  recoverables:
    model: *id009
    lr_annealing_wav2vec: *id010
    counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
      limit: 20
    normalize: *id008
# Same EpochCounter instance as the `counter` recoverable above.
epoch_counter: *id011
# Training log written to the train_log path.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt

# Error-rate statistics factory with split_tokens enabled.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats