wavlm_large-smol2_1_7B_gateddnn_lora / hyperparams.yaml

Add files using upload-large-folder tool

be7aa66 verified about 2 months ago

7.87 kB

	# Generated 2025-08-16 from:
	# /home/adelmou/proj/speechbrain/speechllm_librispeech/speechbrain/recipes/LibriSpeech/ASR/transformer/hparams/llama.yaml
	# yamllint disable
	# ############################################################################
	# Authors: Adel Moumen
	# ############################################################################
	# Seed needs to be set at top of yaml, before objects with parameters are made
	seed: 3407
	__set_seed: !apply:speechbrain.utils.seed_everything [3407]
	# NOTE(review): this name mentions llama_3.2_1b while llm_path in this file
	# points to SmolLM2-1.7B; presumably inherited from the template recipe.
	# Value left untouched; confirm before using it for run bookkeeping.
	experiment_name: llama_3.2_1b_ASR
	# Result locations. Every path is kept on the same line as its key so that it
	# cannot be read as a separate entry. The doubled "/" is reproduced verbatim
	# (output_folder already ends with "/").
	output_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora/
	output_wer_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//wer_results
	save_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
	train_log: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt


	# Data files
	data_folder: /localscratch/adelmou.48174029.0/LibriSpeech/
	# e.g., /path/to/LibriSpeech
	# If RIRS_NOISES dir exists in /localscratch/xxx_corpus/RIRS_NOISES
	# then data_folder_rirs should be /localscratch/xxx_corpus
	# otherwise the dataset will automatically be downloaded
	# data_folder_rirs: !ref <data_folder>
	# All three training splits are enabled.
	train_splits: [train-clean-100, train-clean-360, train-other-500]
	dev_splits: [dev-clean]
	test_splits: [test-clean, test-other]
	skip_prep: false
	# Manifest paths: each value sits on its key's line; test_csv is a
	# two-item list (one manifest per entry of test_splits).
	train_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train.csv
	valid_csv: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//dev-clean.csv
	test_csv:
	  - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-clean.csv
	  - /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//test-other.csv

	ckpt_interval_minutes: 15  # save checkpoint every N min

	####################### Training Parameters ####################################

	# Location of the pretrained SSL encoder to load. In this run it is a local
	# copy of wavlm-large rather than a HuggingFace hub URL.
	ssl_hub: /localscratch/adelmou.48174029.0/wavlm-large/
	# Value kept on the key's line so it cannot be read as a separate entry.
	ssl_folder: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
	ssl_frozen: true

	# LLM options
	llm_path: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
	llm_emb_size: 2048

	number_of_epochs: 20
	batch_size: 32 # Only used if dynamic batching is off.
	# Original note: "400s * 10 => 1h / opt step".
	# NOTE(review): with max_batch_length_train: 300 and
	# grad_accumulation_factor: 5 as set in this file, the same arithmetic gives
	# 300 s * 5 = 1500 s (~25 min) per optimizer step; confirm which is intended.
	grad_accumulation_factor: 5
	loss_reduction: batchmean
	sorting: random
	num_workers: 4
	precision: bf16 # bf16, fp16 or fp32
	eval_precision: bf16
	max_grad_norm: 1.0

	# Learning-rate related parameters.
	# initial_lr / final_lr carry the same values as the scheduler's
	# initial_value / final_value in this file; lr_wav2vec matches the lr of
	# Adam_wav2vec2 and the initial_value of lr_annealing_wav2vec.
	initial_lr: 0.0005
	final_lr: 0.00001
	lr_wav2vec: 0.00002

	weight_decay: 0.0
	warmup_steps: 5000
	augment_warmup: 7500

	# BPE parameters
	token_type: unigram # ["unigram", "bpe", "char"]
	character_coverage: 1.0

	# Feature parameters
	sample_rate: 16000
	downsampling_factor: 5 # Used to downsample frames before llm projection.

	# This setup works well for an A100 80GB GPU; adapt it to your needs.
	# Or turn it off (but training speed will decrease)
	dynamic_batching: true
	max_batch_length_train: 300
	max_batch_length_val: 100  # we reduce it as the beam is much wider (VRAM)
	num_bucket: 200
	shuffle: true  # if true re-creates batches at each epoch shuffling examples.
	batch_ordering: random
	max_batch_ex: 256

	# Options for the training dynamic batch sampler. They are nested under the
	# key: left flush they would collide with the top-level shuffle,
	# batch_ordering and max_batch_ex entries above.
	dynamic_batch_sampler_train:
	  max_batch_length: 300
	  num_buckets: 200
	  shuffle: true
	  batch_ordering: random
	  max_batch_ex: 256

	# Same options for validation; only max_batch_length differs (100 vs 300).
	dynamic_batch_sampler_valid:
	  max_batch_length: 100
	  num_buckets: 200
	  shuffle: true
	  batch_ordering: random
	  max_batch_ex: 256

	# Dataloader options
	# Each *_dataloader_opts mapping holds its own batch_size and collate_fn.
	# padding_kwargs and per_key_padding_kwargs are arguments of the PaddedBatch
	# collate function and are therefore nested below collate_fn.
	# The default padding value (49152) equals pad_token in this file;
	# "sig" and "tokens_eos" override it with 0 and -100 respectively.
	train_dataloader_opts:
	  batch_size: 32
	  shuffle: true
	  num_workers: 4
	  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
	    padding_kwargs:
	      value: 49152
	    per_key_padding_kwargs:
	      sig:
	        value: 0
	      tokens_eos:
	        value: -100

	valid_dataloader_opts:
	  batch_size: 8
	  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
	    padding_kwargs:
	      value: 49152
	    per_key_padding_kwargs:
	      sig:
	        value: 0
	      tokens_eos:
	        value: -100

	test_dataloader_opts:
	  batch_size: 8
	  collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
	    padding_kwargs:
	      value: 49152
	    per_key_padding_kwargs:
	      sig:
	        value: 0
	      tokens_eos:
	        value: -100


	####################### Model Parameters ###########################
	# &id001 anchors the activation so it can be reused by alias (*id001).
	activation: &id001 !name:speechbrain.nnet.activations.Swish
	asr_output_neurons: 1024
	lora_rank: 16

	# Frames - LLM projector params
	dnn_layers: 2
	dnn_neurons: 1024
	# Same value as in_features of linear_projection in this file (5120).
	downsampling_output_dim: 5120
	avg_checkpoints: 3

	# Outputs
	blank_index: 0
	# Pad index obtained after adding a pad token to the tokenizer (the author
	# flags this as an ugly workaround). Same value as padding_kwargs.value in
	# the dataloader options.
	# NOTE(review): the original comment said "Llama 3", but llm_path in this
	# file points to SmolLM2-1.7B; confirm the index against that tokenizer.
	pad_token: 49152

	# Decoding parameters
	valid_search_interval: 4
	valid_beam_size: 1 # We do greedy here so it's faster to decode ...
	test_beam_size: 5

	############################## models ################################

	# norm_type is an argument of InputNormalization, so it is nested under it.
	normalize: &id008 !new:speechbrain.processing.features.InputNormalization
	  norm_type: sentence

	# SSL encoder: the wavlm-large checkpoint loaded through the Wav2Vec2 wrapper.
	# The keys below are its arguments; save_path is kept on one line.
	ssl: &id003 !new:speechbrain.integrations.huggingface.wav2vec2.Wav2Vec2
	  source: /localscratch/adelmou.48174029.0/wavlm-large/
	  output_norm: true
	  freeze: true
	  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save/ssl_checkpoint
	  device_map: cuda
	  # attn_implementation: sdpa
	  # normalize_wav: False

	# Bias-free linear map from 5120 to 2048 features; the keys below are
	# arguments of torch.nn.Linear.
	linear_projection: &id007 !new:torch.nn.Linear
	  in_features: 5120
	  out_features: 2048
	  bias: false

	# Gated DNN block; reuses the Swish activation through the *id001 alias.
	gated_nn: &id006 !new:speechbrain.lobes.models.GatedNN.GatedNN
	  # NOTE(review): the original inline comment read "5 x 1024" (= 5120, the
	  # in_features of linear_projection), but the last dimension given here is
	  # 2048; confirm which one is intended.
	  input_shape: [null, null, 2048]
	  activation: *id001
	  blocks: 2
	  neurons: 1024

	# Top-level flag. The same-named key inside backbone_llm below is that
	# object's own argument, so the two must not share an indentation level.
	output_hidden_states: false
	# LLM backbone loaded from the local SmolLM2-1.7B directory with frozen
	# weights (freeze: true).
	backbone_llm: &id002 !new:speechbrain.integrations.huggingface.llama.LLaMA
	  source: /localscratch/adelmou.48174029.0/SmolLM2-1.7B/
	  save_path: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
	  freeze: true
	  attn_implementation: flash_attention_2
	  device: cuda
	  torch_dtype: !name:torch.bfloat16
	  # Audio boundary tokens. The backslashes before "|" were markup escaping,
	  # not part of the token text; quoted so the scalars are unambiguous.
	  additional_special_tokens:
	    - '<|start_of_audio|>'
	    - '<|end_of_audio|>'
	  output_hidden_states: false

	# LoRA adaptation of the LLM backbone (active in this configuration):
	# wraps backbone_llm (*id002) with rank-16 LoRA adapters.
	llm: &id005 !new:speechbrain.nnet.adapters.AdaptedModel
	  model_to_adapt: *id002
	  adapter_class: !name:speechbrain.nnet.adapters.LoRA
	  all_linear: true
	  adapter_kwargs:
	    rank: 16

	# Frame downsampler; the factor equals the top-level downsampling_factor (5).
	feat_downsampler: &id004 !new:speechbrain.lobes.downsampling.ConcatDownsampler
	  downsampling_factor: 5

	# Name -> object registry; every entry aliases an object anchored earlier.
	modules:
	  ssl: *id003
	  feat_downsampler: *id004
	  llm: *id005
	  gated_nn: *id006
	  linear_projection: *id007
	  normalize: *id008

	# ModuleList built from the ssl, llm, gated_nn and linear_projection objects.
	# The entries must be aliases (*idNNN): bare words such as "id003" would be
	# plain strings, not the anchored modules.
	model: &id009 !new:torch.nn.ModuleList
	  - [*id003, *id005, *id006, *id007]

	# Optimizer factories; lr and weight_decay are arguments of each AdamW and
	# are nested so the two optimizers do not share keys.
	Adam: !name:torch.optim.AdamW
	  lr: 0.0005
	  weight_decay: 0.0

	Adam_wav2vec2: !name:torch.optim.AdamW
	  lr: 0.00002
	  weight_decay: 0.0


	# Linear schedule from 0.0005 to 0.00001 over 20 epochs (same values as
	# initial_lr, final_lr and number_of_epochs in this file).
	scheduler: !new:speechbrain.nnet.schedulers.LinearScheduler
	  initial_value: 0.0005
	  final_value: 0.00001
	  epoch_count: 20

	# NewBob schedule starting at 0.00002 (same value as lr_wav2vec). Its
	# arguments are nested so initial_value does not clash with the scheduler
	# above.
	lr_annealing_wav2vec: &id010 !new:speechbrain.nnet.schedulers.NewBobScheduler
	  initial_value: 0.00002
	  improvement_threshold: 0.0025
	  annealing_factor: 0.8
	  patient: 1


	# Checkpointer writing to the save folder. recoverables maps a name to each
	# object to be checkpointed. "normalize" belongs inside recoverables: a
	# top-level "normalize" key already exists earlier in this file.
	checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
	  checkpoints_dir: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//save
	  recoverables:
	    model: *id009
	    lr_annealing_wav2vec: *id010
	    # The epoch counter is defined here (anchor id011) and reused below.
	    counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
	      limit: 20
	    normalize: *id008

	# Top-level handle on the same EpochCounter instance.
	epoch_counter: *id011
	train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
	  save_file: /scratch/adelmou/speechllm_results_ls/wavlm_large+smollm2_1.7B_gateddnn_lora//train_log.txt

	# Error-rate statistics factory with split_tokens enabled; split_tokens is
	# an argument of ErrorRateStats and is therefore nested under the key.
	cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
	  split_tokens: true
	error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats