---
# Run configuration with three top-level sections: data, model and training.
# The block structure below was restored from a copy whose line breaks and
# indentation had been collapsed; all scalar values are unchanged.

data:
  data_path: ../data/msmarco/parsed_hard_ids_10p_train.jsonl
  dataset_seed: 42
  max_block_length: 160
  max_seq_length: 6144
  num_documents: 30
  qrels_path: null
  streaming: false
  train_test_split: 0.99
  val_data_path: null

model:
  attn_implementation: default_blockrank
  # NOTE(review): lora_alpha / lora_r are -1 while use_lora is false;
  # presumably "unset" sentinels - confirm against the consumer.
  lora_alpha: -1
  lora_dropout: 0.0
  lora_r: -1
  # Single comma-separated string, not a YAML sequence.
  lora_target_modules: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj
  model_name_or_path: /workspace/nilesh_work/hf_cache/Mistral-7B-Instruct-v0.3
  trust_remote_code: false
  use_4bit: false
  use_blockrank: true
  use_lora: false

training:
  accelerator_config:
    dispatch_batches: null
    even_batches: true
    gradient_accumulation_kwargs: null
    non_blocking: false
    split_batches: false
    use_seedable_sampler: true
  activation_offloading: false
  adafactor: false
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  assistant_only_loss: false
  auto_find_batch_size: false
  aux_layer_idx: 20
  aux_loss_weight: 0.1
  aux_temperature: 0.05
  average_tokens_across_devices: true
  batch_eval_metrics: false
  bf16: true
  bf16_full_eval: false
  chat_template_path: null
  completion_only_loss: null
  data_seed: null
  dataloader_drop_last: false
  dataloader_num_workers: 0
  dataloader_persistent_workers: false
  dataloader_pin_memory: true
  dataloader_prefetch_factor: null
  dataset_kwargs:
    skip_prepare_dataset: true
  dataset_num_proc: null
  dataset_text_field: text
  ddp_backend: null
  ddp_broadcast_buffers: null
  ddp_bucket_cap_mb: null
  ddp_find_unused_parameters: null
  ddp_timeout: 1800
  debug: []
  deepspeed: null
  disable_tqdm: false
  do_eval: true
  do_predict: false
  do_train: false
  # NOTE(review): value was empty in the original (parses as null); it may
  # have been a redacted placeholder - confirm. Same for the other *_token keys.
  eos_token: null
  eval_accumulation_steps: null
  eval_delay: 0
  eval_do_concat_batches: true
  eval_on_start: false
  eval_packing: null
  eval_steps: 500
  # NOTE(review): eval_strategy is 'no' while evaluation_strategy is steps and
  # do_eval is true; these look contradictory - confirm which one the
  # consumer honours. Quotes are required so 'no' stays a string.
  eval_strategy: 'no'
  eval_use_gather_object: false
  evaluation_strategy: steps
  fp16: false
  fp16_backend: auto
  fp16_full_eval: false
  fp16_opt_level: O1
  fsdp: []
  fsdp_config:
    min_num_params: 0
    xla: false
    xla_fsdp_grad_ckpt: false
    xla_fsdp_v2: false
  fsdp_min_num_params: 0
  fsdp_transformer_layer_cls_to_wrap: null
  full_determinism: false
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: null
  greater_is_better: false
  group_by_length: false
  half_precision_backend: auto
  hub_always_push: false
  hub_model_id: null
  hub_private_repo: null
  hub_revision: null
  hub_strategy: every_save
  hub_token: null
  ignore_data_skip: false
  include_for_metrics: []
  include_inputs_for_metrics: false
  # Quoted so the value stays the string 'no' rather than a boolean.
  include_num_input_tokens_seen: 'no'
  include_tokens_per_second: false
  jit_mode_eval: false
  label_names: null
  label_smoothing_factor: 0.0
  learning_rate: 3.0e-06
  length_column_name: length
  liger_kernel_config: null
  load_best_model_at_end: false
  local_rank: 0
  log_level: passive
  log_level_replica: warning
  log_on_each_node: true
  logging_dir: ../outputs/blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full/runs/Nov03_04-06-43_06353250b0cb
  logging_first_step: true
  logging_nan_inf_filter: true
  logging_steps: 25
  logging_strategy: steps
  loss_type: nll
  lr_scheduler_kwargs: {}
  lr_scheduler_type: cosine
  max_grad_norm: 1.0
  max_length: 1024
  max_steps: -1
  metric_for_best_model: eval_loss
  model_init_kwargs: null
  # Empty string, not null.
  mp_parameters: ''
  neftune_noise_alpha: null
  no_cuda: false
  num_train_epochs: 1
  optim: adamw_8bit
  optim_args: null
  optim_target_modules: null
  output_dir: ../outputs/blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full
  overwrite_output_dir: false
  packing: false
  packing_strategy: bfd
  pad_to_multiple_of: null
  pad_token: null
  padding_free: false
  parallelism_config: null
  past_index: -1
  per_device_eval_batch_size: 1
  per_device_train_batch_size: 1
  per_gpu_eval_batch_size: null
  per_gpu_train_batch_size: null
  prediction_loss_only: false
  project: huggingface
  push_to_hub: false
  push_to_hub_model_id: null
  push_to_hub_organization: null
  push_to_hub_token: null
  ray_scope: last
  remove_unused_columns: false
  report_to:
    - wandb
  restore_callback_states_from_checkpoint: false
  resume_from_checkpoint: null
  run_name: blockrank-with-aux-loss-mistral-7b-icr-medium_hard_ids-full
  save_on_each_node: false
  save_only_model: false
  save_safetensors: true
  save_steps: 500
  # NOTE(review): save_strategy is 'no' although save_steps / save_total_limit
  # are set - presumably checkpointing is disabled; confirm.
  save_strategy: 'no'
  save_total_limit: 1
  seed: 42
  skip_memory_metrics: true
  tf32: null
  torch_compile: false
  torch_compile_backend: null
  torch_compile_mode: null
  torch_empty_cache_steps: null
  torchdynamo: null
  tpu_metrics_debug: false
  tpu_num_cores: null
  trackio_space_id: trackio
  use_aux_loss: true
  use_cpu: false
  use_legacy_prediction_loop: false
  use_liger_kernel: false
  use_mps_device: false
  warmup_ratio: 0.01
  warmup_steps: 0
  weight_decay: 0