Adel-Moumen's picture
Add files using upload-large-folder tool
be7aa66 verified
# Generated 2025-08-16 from:
# /home/adelmou/proj/speechbrain/speechllm_librispeech/speechbrain/recipes/LibriSpeech/ASR/transformer/hparams/llama.yaml
# yamllint disable
# ############################################################################
# Authors: Adel Moumen
# ############################################################################
# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 3407
__set_seed: !apply:speechbrain.utils.seed_everything [3407]
# NOTE(review): the name mentions Llama 3.2 1B, while llm_path and
# output_folder point to SmolLM2-1.7B -- confirm whether the name is stale.
experiment_name: llama_3.2_1b_ASR
# Every path below shares output_folder as its prefix. The doubled slash is
# kept verbatim (output_folder already ends in "/").
output_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora/
output_wer_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//wer_results
save_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
train_log: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt
# Data files
data_folder: /localscratch/adelmou.48174029.0/LibriSpeech/
# e.g., /path/to/LibriSpeech
# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES
# then data_folder_rirs should be /localscratch/xxx_corpus
# otherwise the dataset will automatically be downloaded
# data_folder_rirs: !ref <data_folder>
# All three LibriSpeech training splits are used.
train_splits: [train-clean-100, train-clean-360, train-other-500]
dev_splits: [dev-clean]
test_splits: [test-clean, test-other]
skip_prep: false
# Manifest files, all located under output_folder.
train_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train.csv
valid_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//dev-clean.csv
# One manifest per entry of test_splits.
test_csv:
  - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-clean.csv
  - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-other.csv
ckpt_interval_minutes: 15  # save checkpoint every N min
####################### Training Parameters ####################################
# Source of the pretrained SSL encoder; here a local directory named
# wavlm-large rather than a hub URL.
ssl_hub: /localscratch/adelmou.48174029.0/wavlm-large/
# Same path as the ssl object's save_path (a sub-directory of save_folder).
ssl_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
ssl_frozen: true
# LLM options
llm_path: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
# Matches out_features of linear_projection (2048).
llm_emb_size: 2048
# Optimisation schedule, precision and learning-rate settings.
number_of_epochs: 20
batch_size: 32 # Only used if dynamic batching is off.
# 400s * 10 => 1h / opt step
# NOTE(review): the figure above does not match this file, which sets
# max_batch_length_train to 300 and grad_accumulation_factor to 5 -- confirm.
grad_accumulation_factor: 5
loss_reduction: batchmean
sorting: random
num_workers: 4
precision: bf16 # bf16, fp16 or fp32
eval_precision: bf16
max_grad_norm: 1.0
# stages related parameters
# initial_lr / final_lr are the end points given to the LinearScheduler
# (`scheduler`); lr_wav2vec is the value used by Adam_wav2vec2 and
# lr_annealing_wav2vec.
initial_lr: 0.0005
final_lr: 0.00001
lr_wav2vec: 0.00002
weight_decay: 0.0
warmup_steps: 5000
augment_warmup: 7500
# BPE parameters
token_type: unigram # ["unigram", "bpe", "char"]
character_coverage: 1.0
# Feature parameters
sample_rate: 16000
downsampling_factor: 5 # Used to downsample frames before llm projection.
# This setup works well for an A100 80GB GPU; adapt it to your needs.
# Or turn it off (but training speed will decrease)
dynamic_batching: true
# The values below are repeated in dynamic_batch_sampler_train and
# dynamic_batch_sampler_valid.
max_batch_length_train: 300
max_batch_length_val: 100 # we reduce it as the beam is much wider (VRAM)
num_bucket: 200
shuffle: true # if true re-creates batches at each epoch shuffling examples.
batch_ordering: random
max_batch_ex: 256
# Dynamic batch sampler arguments for training. Values equal the top-level
# max_batch_length_train, num_bucket, shuffle, batch_ordering and max_batch_ex.
dynamic_batch_sampler_train:
  max_batch_length: 300
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256
# Dynamic batch sampler arguments for validation. Same as the training sampler
# except max_batch_length, which equals max_batch_length_val (100).
dynamic_batch_sampler_valid:
  max_batch_length: 100
  num_buckets: 200
  shuffle: true
  batch_ordering: random
  max_batch_ex: 256
# Dataloader options
# Training loader. The collate function pads every key with 49152 (the
# pad_token value) unless a per-key override applies: `sig` is padded with 0
# and `tokens_eos` with -100.
train_dataloader_opts:
  batch_size: 32
  shuffle: true
  num_workers: 4
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100
# Validation loader. Padding values are the same as for the training loader:
# 49152 by default, 0 for `sig`, -100 for `tokens_eos`.
valid_dataloader_opts:
  batch_size: 8
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100
# Test loader. Padding values are the same as for the training loader:
# 49152 by default, 0 for `sig`, -100 for `tokens_eos`.
test_dataloader_opts:
  batch_size: 8
  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
    padding_kwargs:
      value: 49152
    per_key_padding_kwargs:
      sig:
        value: 0
      tokens_eos:
        value: -100
####################### Model Parameters ###########################
# Activation handed to gated_nn through the *id001 alias.
activation: &id001 !name:speechbrain.nnet.activations.Swish
asr_output_neurons: 1024
lora_rank: 16
# Frames - LLM projector params
dnn_layers: 2
dnn_neurons: 1024
# 5120 = downsampling_factor (5) x 1024; presumably 1024 is the SSL feature
# size -- TODO confirm.
downsampling_output_dim: 5120
avg_checkpoints: 3
# Outputs
blank_index: 0
# Pad index after adding the pad token; also used as the default padding value
# in the dataloader options.
# NOTE(review): the earlier comment referred to Llama 3, but llm_path points to
# SmolLM2-1.7B -- confirm that 49152 is the right index for this tokenizer.
pad_token: 49152
# Decoding parameters
valid_search_interval: 4
valid_beam_size: 1 # We do greedy here so it's faster to decode ...
test_beam_size: 5
############################## models ################################
# Input normalisation with norm_type "sentence"; shared through *id008 with
# `modules` and the checkpointer recoverables.
normalize: &id008 !new:speechbrain.processing.features.InputNormalization
  norm_type: sentence
# SSL speech encoder: the wavlm-large directory loaded through the Wav2Vec2
# wrapper class, with freeze set to true.
ssl: &id003 !new:speechbrain.integrations.huggingface.wav2vec2.Wav2Vec2
  source: /localscratch/adelmou.48174029.0/wavlm-large/
  output_norm: true
  freeze: true
  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
  device_map: cuda
  # attn_implementation: sdpa
  # normalize_wav: False
# Bias-free linear map from downsampling_output_dim (5120) to llm_emb_size
# (2048).
linear_projection: &id007 !new:torch.nn.Linear
  in_features: 5120
  out_features: 2048
  bias: false
# Gated feed-forward block using the Swish activation (*id001).
# NOTE(review): an earlier trailing comment on input_shape read "5 x 1024"
# (= 5120), which does not match the 2048 given here; 2048 equals
# linear_projection's out_features -- confirm the intended input size.
gated_nn: &id006 !new:speechbrain.lobes.models.GatedNN.GatedNN
  input_shape: [null, null, 2048]
  activation: *id001
  blocks: 2
  neurons: 1024
# NOTE(review): presumably a top-level flag (the same value also appears under
# backbone_llm) rather than an argument of gated_nn -- confirm against the
# source recipe.
output_hidden_states: false
# LLM backbone: SmolLM2-1.7B loaded through the LLaMA wrapper with freeze set
# to true, torch.bfloat16 and flash_attention_2. Two extra special tokens
# (<|start_of_audio|>, <|end_of_audio|>) are passed as
# additional_special_tokens.
backbone_llm: &id002 !new:speechbrain.integrations.huggingface.llama.LLaMA
  source: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
  freeze: true
  attn_implementation: flash_attention_2
  device: cuda
  torch_dtype: !name:torch.bfloat16
  additional_special_tokens:
    - <|start_of_audio|>
    - <|end_of_audio|>
  output_hidden_states: false
# LoRA adaptation is enabled here: backbone_llm (*id002) is wrapped in an
# AdaptedModel that applies LoRA adapters of rank 16 (same value as lora_rank)
# with all_linear set to true.
llm: &id005 !new:speechbrain.nnet.adapters.AdaptedModel
  model_to_adapt: *id002
  adapter_class: !name:speechbrain.nnet.adapters.LoRA
  all_linear: true
  adapter_kwargs:
    rank: 16
# Frame downsampler applied before the LLM projection; the factor equals the
# top-level downsampling_factor (5).
feat_downsampler: &id004 !new:speechbrain.lobes.downsampling.ConcatDownsampler
  downsampling_factor: 5
# Named modules of the recipe; every entry is an alias to an object defined
# earlier in this file.
modules:
  ssl: *id003
  feat_downsampler: *id004
  llm: *id005
  gated_nn: *id006
  linear_projection: *id007
  normalize: *id008
# ModuleList grouping ssl, llm, gated_nn and linear_projection; registered as
# the `model` recoverable of the checkpointer.
model: &id009 !new:torch.nn.ModuleList
  - [*id003, *id005, *id006, *id007]
# AdamW optimizer factory; lr equals initial_lr and weight_decay equals the
# top-level weight_decay.
Adam: !name:torch.optim.AdamW
  lr: 0.0005
  weight_decay: 0.0
# AdamW optimizer factory with lr equal to lr_wav2vec.
# NOTE(review): ssl_frozen is true in this file -- confirm whether this
# optimizer is actually used.
Adam_wav2vec2: !name:torch.optim.AdamW
  lr: 0.00002
  weight_decay: 0.0
# Linear schedule from initial_lr (0.0005) to final_lr (0.00001) over
# number_of_epochs (20) epochs.
scheduler: !new:speechbrain.nnet.schedulers.LinearScheduler
  initial_value: 0.0005
  final_value: 0.00001
  epoch_count: 20
# NewBob scheduler starting at lr_wav2vec (0.00002); saved by the checkpointer
# through the *id010 alias.
lr_annealing_wav2vec: &id010 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.00002
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 1
# Checkpointer writing to save_folder. Recoverables: the model ModuleList, the
# wav2vec LR scheduler, the epoch counter (limit = number_of_epochs) and the
# input normalizer.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
  recoverables:
    model: *id009
    lr_annealing_wav2vec: *id010
    counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
      limit: 20
    normalize: *id008
# Top-level handle on the same EpochCounter instance that is checkpointed
# above as `counter`.
epoch_counter: *id011
# File logger; save_file is the same path as the top-level train_log.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt
# ErrorRateStats factory with split_tokens enabled; presumably this yields a
# character-level error rate, as the key name suggests -- confirm.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats