Built with Axolotl

See axolotl config

axolotl version: 0.13.0.dev0

# !pip install transformers==4.55.4
# !pip install --no-deps trl==0.22.2
# !pip install --no-build-isolation mamba_ssm==2.2.5
# !pip install --no-build-isolation causal_conv1d==1.5.2
# === Model Configuration ===
# Base checkpoint to adapt. 4-bit loading is on and 8-bit loading is off.
base_model: 'LatitudeGames/Muse-12B'
load_in_4bit: true
load_in_8bit: false

# === HF Configuration ===
# Local checkpoint directory, then the Hub repository id and push strategy.
output_dir: 'ckpts-mmarv'
hub_model_id: 'ToastyPigeon/muse-marvin-ffn-lora'
hub_strategy: every_save

# === Training Setup ===
# One epoch over packed sequences padded to the full sequence length.
num_epochs: 1
sequence_len: 16384
sample_packing: true
pad_to_sequence_len: true
# Micro-batch of 1 with gradients accumulated over 4 steps.
micro_batch_size: 1
gradient_accumulation_steps: 4
# Disabled options kept for reference:
# sequence_parallel_degree: 2
# heads_k_stride: 1
# temperature: 0.7
# max_steps: 10
# === Evaluation ===
# Validation split size and how often evaluation runs within an epoch.
evals_per_epoch: 10
val_set_size: 0.025
eval_max_new_tokens: 128
# Disabled options kept for reference:
# eval_steps: 20
# max_steps: 60
# eval_table_size:
# eval_sample_packing: true
# eval_strategy: "no"

# === LoRA Configuration ===
# QLoRA adapter; only the up/down/gate projections are targeted.
adapter: qlora
peft_use_rslora: false
lora_r: 32
lora_alpha: 32
lora_dropout: 0.1
lora_target_modules:
  - up_proj
  - down_proj
  - gate_proj
  # Attention projections are deliberately left out:
  # - q_proj
  # - v_proj
  # - k_proj
  # - o_proj
# Options intentionally left unset (explicit null instead of a bare key).
lora_model_dir: null
lora_target_linear: null
lora_fan_in_fan_out: null
# Disabled options kept for reference:
# lora_modules_to_save:
#   - embed_tokens
#   - lm_head
# fix_untrained_tokens: true
# lora_mlp_kernel: true
# lora_qkv_kernel: true
# lora_o_kernel: true

# === Hyperparameter Configuration ===
# Fused AdamW with a cosine schedule and a short ratio-based warmup.
optimizer: adamw_torch_fused
# Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML) only
# resolve exponent notation as a float when a "." is present, so a bare
# `1e-5` would be loaded as the string "1e-5".
learning_rate: 1.0e-5
lr_scheduler: cosine
warmup_ratio: 0.025
weight_decay: 0.01
max_grad_norm: 1.0

# Disabled alternatives kept for reference.
# warmup_steps: 0
# optimizer: apollo_adamw_layerwise
# optimizer: paged_adamw_8bit
# optim_args:
#   enable_stochastic_rounding: true
#   enable_cautious: true
#   enable_8bit: true
# Apollo-mini configuration:
# optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100"
# Regular Apollo configuration:
# optim_args:
# optim_target_modules: all_linear
# cosine_min_lr_ratio: 0.2
# lr_scheduler: cosine_with_min_lr
# lr_scheduler_kwargs:
#   cosine_min_lr: 1.0e-6


# === Data Configuration ===
# Disabled tokenizer/template options kept for reference:
# chat_template: jinja
# chat_template: chatml
# tokenizer_use_mistral_common: true

# No special-token overrides are active; earlier candidates were:
#   eos_token: "<|im_end|>"
#   eos_token: "</s>"
special_tokens: null

dataset_prepared_path: last_run_prepared
shuffle_merged_datasets: true

# Active training datasets: one chat-formatted set and two completion sets.
datasets:
  - type: chat_template
    path: grimulkan/LimaRP-augmented
    field_messages: conversations
    message_property_mappings:
      content: value
      role: from
  - type: completion
    path: ToastyPigeon/steve-and-marvin
    data_files: marvin.json
  - type: completion
    path: ToastyPigeon/kimi-stories-completion

# Disabled dataset entries kept for reference. Several list two alternative
# `type:` values; keep only one of them when re-enabling an entry.
#  - path: allenai/tulu-3-sft-personas-instruction-following
#    type: chat_template
#    split: train[:10%]
#  - path: ToastyPigeon/mixed-medical-reasoning-formatted
#    type: chat_template
#    data_files: mixed-medical-thinking.json
#    split: train[:10%]
#  - path: ToastyPigeon/new-story-dataset
#    type: customcompletion-regex
#    type: completion
#    data_files: new-story-dataset-v2.json
#  - path: allura-org/fujin-instruct-v2
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: ToastyPigeon/some-rp-extended
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#    roles_to_train: ["user","assistant"]
#  - path: ToastyPigeon/gutenberg-sft
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: ToastyPigeon/SpringDragon
#    type: customcompletion-regex
#    type: completion
#    split: train
#  - path: ToastyPigeon/some-erotica
#    type: customcompletion-regex
#    type: completion
#    split: train[:10%]


# === Plugins ===
# Axolotl integration plugins enabled for this run (Liger and Cut Cross
# Entropy); their individual toggles are set as separate top-level keys.
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# === Hardware Optimization ===
# Liger toggles: GLU activation, LayerNorm, RMSNorm and RoPE are all on.
liger_glu_activation: true
liger_layer_norm: true
liger_rms_norm: true
liger_rope: true
# Cut Cross Entropy is enabled; the Liger fused linear CE option stays off.
cut_cross_entropy: true
# liger_fused_linear_cross_entropy: true

# Disabled alternatives kept for reference:
# gradient_checkpointing: true
# deepspeed: ../axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json

# === FSDP Config ===
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  # Sharding strategy and per-layer wrapping.
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_use_orig_params: false
  # Memory-related switches.
  fsdp_offload_params: true
  fsdp_activation_checkpointing: true
  fsdp_cpu_ram_efficient_loading: true
  fsdp_limit_all_gathers: true
  # State synchronisation and checkpoint format.
  fsdp_sync_module_states: true
  fsdp_state_dict_type: FULL_STATE_DICT
  # Disabled option kept for reference:
  # fsdp_version: 2
# === Wandb Tracking ===
# Weights & Biases project and run name; the entity is left unset.
wandb_name: 'r32-qlora-ffn'
wandb_project: 'MuseMarvin'
# wandb_entity: [WANDB_ENTITY]

# === Checkpointing ===
# Ten saves per epoch, with the retained-checkpoint limit set to 1.
save_total_limit: 1
saves_per_epoch: 10
# Disabled option kept for reference:
# save_steps: 10

# === Advanced Settings ===
# Reproducibility and numeric precision.
seed: 69
bf16: 'auto'
# Attention implementation and loss/batching switches.
flash_attention: true
train_on_inputs: false
group_by_length: false
# Output format, logging cadence and the gc_steps interval.
save_safetensors: true
logging_steps: 1
gc_steps: 10




muse-marvin-ffn-lora

This model is a fine-tuned version of LatitudeGames/Muse-12B on the grimulkan/LimaRP-augmented, the ToastyPigeon/steve-and-marvin and the ToastyPigeon/kimi-stories-completion datasets. It achieves the following results on the evaluation set:

  • Loss: 2.4226
  • Memory/max Active (gib): 5.33
  • Memory/max Allocated (gib): 5.32
  • Memory/device Reserved (gib): 7.14

Model description

More information needed

Intended uses & limitations

More information needed

Training and evaluation data

More information needed

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 1e-05
  • train_batch_size: 1
  • eval_batch_size: 1
  • seed: 69
  • distributed_type: multi-GPU
  • num_devices: 2
  • gradient_accumulation_steps: 4
  • total_train_batch_size: 8
  • total_eval_batch_size: 2
  • optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  • lr_scheduler_type: cosine
  • lr_scheduler_warmup_steps: 5
  • training_steps: 232

Training results

Training Loss Epoch Step Validation Loss Active (gib) Allocated (gib) Reserved (gib)
No log 0 0 2.5323 8.04 6.73 8.36
2.5899 0.1032 24 2.4883 5.33 5.32 7.14
2.4116 0.2065 48 2.4515 5.33 5.32 7.14
2.367 0.3097 72 2.4390 5.33 5.32 7.14
2.2949 0.4129 96 2.4321 5.33 5.32 7.14
2.5007 0.5161 120 2.4277 5.33 5.32 7.14
2.682 0.6194 144 2.4251 5.33 5.32 7.14
2.317 0.7226 168 2.4235 5.33 5.32 7.14
2.3376 0.8258 192 2.4228 5.33 5.32 7.14
2.5941 0.9290 216 2.4226 5.33 5.32 7.14

Framework versions

  • PEFT 0.17.1
  • Transformers 4.56.1
  • Pytorch 2.7.1+cu126
  • Datasets 4.0.0
  • Tokenizers 0.22.1
Downloads last month
1
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Model tree for ToastyPigeon/muse-marvin-ffn-lora

Adapter
(6)
this model

Datasets used to train ToastyPigeon/muse-marvin-ffn-lora