|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Random seed for the run; the same literal (3407) is passed to
# seed_everything on the next key.
seed: 3407
# HyperPyYAML `!apply` tag: calls the named function with the listed argument.
__set_seed: !apply:speechbrain.utils.seed_everything [3407]
|
|
# NOTE(review): the name mentions llama_3.2_1b, but llm_path and output_folder
# in this file both point to SmolLM2-1.7B -- confirm which label is intended.
experiment_name: llama_3.2_1b_ASR
# Root directory for everything this run writes.
output_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora/
# The paths below live under output_folder. The doubled "/" is kept exactly as
# found (output_folder already ends with "/").
output_wer_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//wer_results
save_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
train_log: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt
|
|
|
|
|
|
|
|
|
|
|
# Location of the LibriSpeech corpus (node-local scratch path).
data_folder: /localscratch/adelmou.48174029.0/LibriSpeech/

# Corpus splits assigned to each stage.
train_splits: [train-clean-100, train-clean-360, train-other-500]
dev_splits: [dev-clean]
test_splits: [test-clean, test-other]
skip_prep: false

# Manifest CSVs, all under output_folder. test_csv holds one entry per test split.
train_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train.csv
valid_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//dev-clean.csv
test_csv:
    - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-clean.csv
    - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-other.csv
|
|
|
|
|
# Checkpoint interval, in minutes (per the key name).
ckpt_interval_minutes: 15
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Speech encoder (WavLM-large): local source directory and checkpoint folder.
ssl_hub: /localscratch/adelmou.48174029.0/wavlm-large/
ssl_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
ssl_frozen: true

# Language model (SmolLM2-1.7B). llm_emb_size equals the out_features of
# linear_projection defined later in this file.
llm_path: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
llm_emb_size: 2048
|
|
|
|
|
# Core training settings.
number_of_epochs: 20
# NOTE(review): dynamic_batching is true elsewhere in this file, so this fixed
# batch size may only apply when dynamic batching is off -- confirm in the
# training script.
batch_size: 32
grad_accumulation_factor: 5
loss_reduction: batchmean
sorting: random
num_workers: 4
# Same numeric precision for training and evaluation.
precision: bf16
eval_precision: bf16
max_grad_norm: 1.0
|
|
|
|
|
|
|
|
# Learning-rate endpoints; the `scheduler` entry in this file uses the same
# two values.
initial_lr: 0.0005
final_lr: 0.00001
# NOTE(review): the SSL encoder is configured with freeze: true in this file,
# so this rate may be unused -- confirm in the training script.
lr_wav2vec: 0.00002

weight_decay: 0.0
warmup_steps: 5000
augment_warmup: 7500
|
|
|
|
|
|
|
|
# Tokenizer settings.
token_type: unigram
character_coverage: 1.0
|
|
|
|
|
|
|
|
# Audio sample rate and feature downsampling factor. feat_downsampler in this
# file uses the same factor (5).
sample_rate: 16000
downsampling_factor: 5
|
|
|
|
|
|
|
|
|
|
|
# Dynamic batching switches. The dynamic_batch_sampler_train / _valid mappings
# in this file repeat these values.
dynamic_batching: true
max_batch_length_train: 300
max_batch_length_val: 100
num_bucket: 200
shuffle: true
batch_ordering: random
max_batch_ex: 256
|
|
|
|
|
# Sampler options for training. Values repeat max_batch_length_train,
# num_bucket, shuffle, batch_ordering and max_batch_ex from this file.
dynamic_batch_sampler_train:
    max_batch_length: 300
    num_buckets: 200
    shuffle: true
    batch_ordering: random
    max_batch_ex: 256
|
|
|
|
|
# Sampler options for validation. Same as the training sampler except
# max_batch_length, which repeats max_batch_length_val (100).
dynamic_batch_sampler_valid:
    max_batch_length: 100
    num_buckets: 200
    shuffle: true
    batch_ordering: random
    max_batch_ex: 256
|
|
|
|
|
|
|
|
# DataLoader options for training.
train_dataloader_opts:
    batch_size: 32
    shuffle: true
    num_workers: 4
    # `!name` builds a partial of PaddedBatch with the keyword arguments below.
    collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
        # Default padding value; equals pad_token (49152) in this file.
        padding_kwargs:
            value: 49152
        # Per-key overrides of the default padding value.
        per_key_padding_kwargs:
            sig:
                value: 0
            # NOTE(review): -100 is presumably the loss ignore index -- confirm
            # against the training script.
            tokens_eos:
                value: -100
|
|
|
|
|
# DataLoader options for validation.
valid_dataloader_opts:
    batch_size: 8
    # `!name` builds a partial of PaddedBatch with the keyword arguments below.
    collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
        # Default padding value; equals pad_token (49152) in this file.
        padding_kwargs:
            value: 49152
        # Per-key overrides of the default padding value.
        per_key_padding_kwargs:
            sig:
                value: 0
            # NOTE(review): -100 is presumably the loss ignore index -- confirm
            # against the training script.
            tokens_eos:
                value: -100
|
|
|
|
|
# DataLoader options for testing.
test_dataloader_opts:
    batch_size: 8
    # `!name` builds a partial of PaddedBatch with the keyword arguments below.
    collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
        # Default padding value; equals pad_token (49152) in this file.
        padding_kwargs:
            value: 49152
        # Per-key overrides of the default padding value.
        per_key_padding_kwargs:
            sig:
                value: 0
            # NOTE(review): -100 is presumably the loss ignore index -- confirm
            # against the training script.
            tokens_eos:
                value: -100
|
|
|
|
|
|
|
|
|
|
|
# Activation, anchored as id001 and reused by gated_nn in this file.
activation: &id001 !name:speechbrain.nnet.activations.Swish
asr_output_neurons: 1024
# Equals adapter_kwargs.rank of the `llm` entry in this file.
lora_rank: 16

# Projector sizes. dnn_layers / dnn_neurons equal gated_nn's blocks / neurons,
# and downsampling_output_dim equals linear_projection's in_features.
dnn_layers: 2
dnn_neurons: 1024
downsampling_output_dim: 5120
avg_checkpoints: 3
|
|
|
|
|
|
|
|
# Special token indices. pad_token equals the default padding value used by
# the dataloader collate functions in this file.
blank_index: 0
pad_token: 49152

# Decoding settings for validation and test.
valid_search_interval: 4
valid_beam_size: 1
test_beam_size: 5
|
|
|
|
|
|
|
|
|
|
|
# Input normalisation, anchored as id008 and reused by `modules` and the
# checkpointer recoverables in this file.
normalize: &id008 !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
|
|
|
|
|
|
|
|
# Speech encoder, anchored as id003. source / save_path / freeze repeat
# ssl_hub / ssl_folder / ssl_frozen from this file.
ssl: &id003 !new:speechbrain.integrations.huggingface.wav2vec2.Wav2Vec2
    source: /localscratch/adelmou.48174029.0/wavlm-large/
    output_norm: true
    freeze: true
    save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
    device_map: cuda
|
|
|
|
|
|
|
|
|
|
|
# Linear projection, anchored as id007. in_features equals
# downsampling_output_dim (5120) and out_features equals llm_emb_size (2048).
linear_projection: &id007 !new:torch.nn.Linear
    in_features: 5120
    out_features: 2048
    bias: false
|
|
|
|
|
# Gated network, anchored as id006. The last input dimension (2048) equals
# llm_emb_size; blocks / neurons equal dnn_layers / dnn_neurons.
gated_nn: &id006 !new:speechbrain.lobes.models.GatedNN.GatedNN
    input_shape: [null, null, 2048]
    activation: *id001
    blocks: 2
    neurons: 1024
|
|
|
|
|
# Top-level flag; backbone_llm below carries the same value.
output_hidden_states: false
# Base language model, anchored as id002 and wrapped by `llm` in this file.
# source equals llm_path and save_path equals save_folder.
backbone_llm: &id002 !new:speechbrain.integrations.huggingface.llama.LLaMA
    source: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
    save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
    freeze: true
    attn_implementation: flash_attention_2
    device: cuda
    torch_dtype: !name:torch.bfloat16
    # Extra special tokens; the names suggest they delimit the audio span.
    additional_special_tokens:
        - <|start_of_audio|>
        - <|end_of_audio|>
    output_hidden_states: false
|
|
|
|
|
|
|
|
# LoRA-adapted wrapper around backbone_llm (id002), anchored as id005.
llm: &id005 !new:speechbrain.nnet.adapters.AdaptedModel
    model_to_adapt: *id002
    adapter_class: !name:speechbrain.nnet.adapters.LoRA
    all_linear: true
    # rank equals lora_rank (16) in this file.
    adapter_kwargs:
        rank: 16
|
|
|
|
|
# Feature downsampler, anchored as id004; the factor equals the top-level
# downsampling_factor (5).
feat_downsampler: &id004 !new:speechbrain.lobes.downsampling.ConcatDownsampler
    downsampling_factor: 5
|
|
|
|
|
# Name -> module mapping; every value is an alias to an object anchored
# earlier in this file.
modules:
    ssl: *id003
    feat_downsampler: *id004
    llm: *id005
    gated_nn: *id006
    linear_projection: *id007
    normalize: *id008
|
|
# ModuleList of ssl, llm, gated_nn and linear_projection, anchored as id009 and
# registered as the checkpointer's `model` recoverable.
# NOTE(review): this includes the ssl and backbone LLM entries that are
# configured with freeze: true -- confirm they are meant to be checkpointed.
model: &id009 !new:torch.nn.ModuleList
    - [*id003, *id005, *id006, *id007]
|
|
# Optimizer factory (`!name` yields a partial); lr equals initial_lr.
Adam: !name:torch.optim.AdamW
    lr: 0.0005
    weight_decay: 0.0

# Second optimizer factory; lr equals lr_wav2vec.
# NOTE(review): the ssl entry in this file has freeze: true, so this optimizer
# may be unused -- confirm in the training script.
Adam_wav2vec2: !name:torch.optim.AdamW
    lr: 0.00002
    weight_decay: 0.0
|
|
|
|
|
|
|
|
# Linear schedule; the three values equal initial_lr, final_lr and
# number_of_epochs in this file.
scheduler: !new:speechbrain.nnet.schedulers.LinearScheduler
    initial_value: 0.0005
    final_value: 0.00001
    epoch_count: 20
|
|
|
|
|
# NewBob annealing, anchored as id010 and registered as a checkpointer
# recoverable in this file; initial_value equals lr_wav2vec.
lr_annealing_wav2vec: &id010 !new:speechbrain.nnet.schedulers.NewBobScheduler
    initial_value: 0.00002
    improvement_threshold: 0.0025
    annealing_factor: 0.8
    patient: 1
|
|
|
|
|
|
|
|
# Checkpointer writing to save_folder.
# NOTE(review): nesting was reconstructed from key order and anchors --
# confirm against the original recipe.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
    recoverables:
        model: *id009
        lr_annealing_wav2vec: *id010
        # The epoch counter is defined inline here (anchor id011) and aliased
        # as the top-level epoch_counter below; limit equals number_of_epochs.
        counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
            limit: 20

        normalize: *id008
epoch_counter: *id011
|
|
# File logger; save_file equals the top-level train_log path.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt
|
|
|
|
|
# Error-rate metric factories (`!name` yields a partial). cer_computer sets
# split_tokens: true; error_rate_computer passes no arguments.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
    split_tokens: true
error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
|
|
|