Built with Axolotl

The full axolotl config for this run (axolotl version: 0.13.0.dev0) is reproduced below.

```yaml

# !pip install transformers==4.55.4
# !pip install --no-deps trl==0.22.2
# !pip install --no-build-isolation mamba_ssm==2.2.5
# !pip install --no-build-isolation causal_conv1d==1.5.2
# === Model Configuration ===
base_model: LatitudeGames/Muse-12B
load_in_8bit: false
load_in_4bit: true
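# QLoRA: the frozen base model is loaded in 4-bit via bitsandbytes, and only
# the LoRA adapter weights configured below are trained (in higher precision).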

# === HF Configuration === 
hub_model_id: ToastyPigeon/muse-marvin-32k-lora
hub_strategy: "every_save"
output_dir: ckpts-mmarv

# === Wandb Tracking ===
wandb_project: MuseMarvin
# wandb_entity: [WANDB_ENTITY]
wandb_name: r32-qlora-32k

# === Training Setup ===
num_epochs: 2
micro_batch_size: 1
gradient_accumulation_steps: 4
sequence_len: 32768
sequence_parallel_degree: 2
heads_k_stride: 1
sample_packing: true
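# sequence_parallel_degree: 2 splits each 32k-token sequence across the two
# GPUs, and sample packing concatenates shorter examples to fill sequences.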
#pad_to_sequence_len: true
#temperature: 0.7
#max_steps: 10
# === Evaluation ===
val_set_size: 0.025
evals_per_epoch: 10
#eval_steps: 20
#max_steps: 60
#eval_table_size:
eval_max_new_tokens: 128
#eval_sample_packing: true
#eval_strategy: "no"

# === LoRA Configuration ===
adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 32
lora_dropout: 0.1
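# With lora_alpha == lora_r, the effective LoRA scale (alpha / r) is 1.0;
# only the attention q/k/v projections below carry adapter weights.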
lora_target_linear:
lora_target_modules:
#  - up_proj
#  - down_proj
#  - gate_proj
  - q_proj
  - v_proj
  - k_proj
#  - o_proj
#  - input_layernorm
#  - post_attention_layernorm
lora_fan_in_fan_out:
peft_use_rslora: false
#lora_modules_to_save:
#  - embed_tokens
#  - lm_head
#fix_untrained_tokens: true
#lora_mlp_kernel: true
#lora_qkv_kernel: true
#lora_o_kernel: true

# === Hyperparameter Configuration ===
#optimizer: apollo_adamw_layerwise
#warmup_steps: 0
warmup_ratio: 0.025
optimizer: adamw_torch_fused
#optimizer: paged_adamw_8bit
#optim_args:
#  enable_stochastic_rounding: true
#  enable_cautious: true
#  enable_8bit: true
# Apollo-mini configuration:
#optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100"
# Regular Apollo configuration:
# optim_args: 
#optim_target_modules: all_linear
learning_rate: 1e-5
lr_scheduler: cosine
#cosine_min_lr_ratio: 0.2
#lr_scheduler: cosine_with_min_lr
#lr_scheduler_kwargs:
#  cosine_min_lr: 1e-6
weight_decay: 0.01
max_grad_norm: 1.0
#warmup_steps: 0
#warmup_ratio: 0.025


# === Data Configuration ===
#
#chat_template: jinja
#chat_template: chatml
special_tokens:
#  eos_token: "<|im_end|>"
#  eos_token: "</s>"
#tokenizer_use_mistral_common: true
shuffle_merged_datasets: true
datasets:
#  - path: grimulkan/LimaRP-augmented
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: allenai/tulu-3-sft-personas-instruction-following
#    type: chat_template
#    split: train[:10%]
#  - path: ToastyPigeon/mixed-medical-reasoning-formatted
#    type: chat_template
#    data_files: mixed-medical-thinking.json
#    split: train[:10%]
  - path: ToastyPigeon/steve-and-marvin
    type: completion
    data_files: marvin.json
#  - path: ToastyPigeon/kimi-stories-completion
#    type: completion
#  - path: ToastyPigeon/new-story-dataset
 #   type: customcompletion-regex
#    type: completion
#    data_files: new-story-dataset-v2.json
#  - path: allura-org/fujin-instruct-v2
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: ToastyPigeon/some-rp-extended
 #   type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#    roles_to_train: ["user","assistant"]
#  - path: ToastyPigeon/gutenberg-sft
#    type: customchatml-regex
#    type: chat_template
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#  - path: ToastyPigeon/SpringDragon
#    type: customcompletion-regex
#    type: completion
#    split: train
#  - path: ToastyPigeon/some-erotica
#    type: customcompletion-regex
#    type: completion
#    split: train[:10%]

dataset_prepared_path: last_run_prepared


# === Plugins ===
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
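# Liger swaps in fused Triton kernels (RoPE, RMSNorm, GLU; toggles below) and
# Cut Cross Entropy computes the loss without materializing the full logit
# matrix; both reduce activation memory at long context.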

# === Hardware Optimization ===
#gradient_checkpointing: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
#liger_fused_linear_cross_entropy: true
cut_cross_entropy: true

#deepspeed: ../axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json

# === FSDP Config === 
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_activation_checkpointing: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
#  fsdp_version: 2
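# FULL_SHARD shards parameters, gradients, and optimizer state across both
# GPUs; activation checkpointing and parameter offload trade speed for memory.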

# === Checkpointing ===
#save_steps: 10
saves_per_epoch: 2
save_total_limit: 1

# === Advanced Settings ===
bf16: auto
flash_attention: true
train_on_inputs: false
group_by_length: false
save_safetensors: true
logging_steps: 1
gc_steps: 10
seed: 69
```
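Since the config trains a QLoRA adapter on top of LatitudeGames/Muse-12B, a minimal inference sketch looks like the following (assuming transformers, peft, and bitsandbytes are installed; the 4-bit settings mirror `load_in_4bit: true` above, and the prompt string is an arbitrary example):

```python
# Minimal inference sketch: load the base model in 4-bit, mirroring the QLoRA
# training setup, then attach this adapter from the Hub.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "LatitudeGames/Muse-12B"
adapter_id = "ToastyPigeon/muse-marvin-32k-lora"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id,
    quantization_config=bnb_config,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_id)

# The adapter was trained on completion-style raw text, so prompt it with
# plain prose rather than a chat template.
prompt = "The rain had not let up for three days when"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```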




muse-marvin-32k-lora

This model is a fine-tuned version of LatitudeGames/Muse-12B on the ToastyPigeon/steve-and-marvin dataset. It achieves the following results on the evaluation set:

  • Loss: 2.5071
  • Max active memory: 4.98 GiB
  • Max allocated memory: 4.89 GiB
  • Reserved memory: 6.9 GiB

Model description

A rank-32 QLoRA adapter for LatitudeGames/Muse-12B, trained at a 32,768-token sequence length. Per the config above, LoRA weights (alpha 32, dropout 0.1) are applied to the attention q/k/v projections only.

Intended uses & limitations

More information needed

Training and evaluation data

Training used the ToastyPigeon/steve-and-marvin dataset (marvin.json) as completion-style raw text with sample packing; 2.5% of the data (val_set_size: 0.025) was held out for evaluation.

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 1e-05
  • train_batch_size: 1
  • eval_batch_size: 1
  • seed: 69
  • distributed_type: multi-GPU
  • num_devices: 2
  • gradient_accumulation_steps: 4
  • total_train_batch_size: 8 (see the arithmetic check after this list)
  • total_eval_batch_size: 2
  • optimizer: AdamW (torch fused) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
  • lr_scheduler_type: cosine
  • lr_scheduler_warmup_steps: 10
  • training_steps: 420
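
Two of the values above are derived rather than set directly; a quick arithmetic check, using only numbers from the config:

```python
# total_train_batch_size is the usual product of per-device batch size,
# gradient accumulation, and device count.
micro_batch_size = 1
gradient_accumulation_steps = 4
num_devices = 2
total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
print(total_train_batch_size)    # 8

# The 10 warmup steps follow from warmup_ratio * training_steps.
warmup_steps = int(0.025 * 420)
print(warmup_steps)              # 10
```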

Training results

| Training Loss | Epoch  | Step | Validation Loss | Active (GiB) | Allocated (GiB) | Reserved (GiB) |
|--------------:|-------:|-----:|----------------:|-------------:|----------------:|---------------:|
| No log        | 0      | 0    | 2.6286          | 8.04         | 6.73            | 8.36           |
| 2.4233        | 0.0993 | 21   | 2.6047          | 4.98         | 4.89            | 6.9            |
| 2.5581        | 0.1986 | 42   | 2.5627          | 4.98         | 4.89            | 6.9            |
| 2.3368        | 0.2979 | 63   | 2.5447          | 4.98         | 4.89            | 6.9            |
| 2.5579        | 0.3972 | 84   | 2.5328          | 4.98         | 4.89            | 6.9            |
| 2.4241        | 0.4965 | 105  | 2.5253          | 4.98         | 4.89            | 6.9            |
| 2.4608        | 0.5957 | 126  | 2.5199          | 4.98         | 4.89            | 6.9            |
| 2.8143        | 0.6950 | 147  | 2.5156          | 4.98         | 4.89            | 6.9            |
| 2.6305        | 0.7943 | 168  | 2.5129          | 4.98         | 4.89            | 6.9            |
| 2.3989        | 0.8936 | 189  | 2.5105          | 4.98         | 4.89            | 6.9            |
| 2.6816        | 0.9929 | 210  | 2.5096          | 4.98         | 4.89            | 6.9            |
| 2.629         | 1.0898 | 231  | 2.5092          | 4.98         | 4.89            | 6.9            |
| 2.4645        | 1.1891 | 252  | 2.5088          | 4.98         | 4.89            | 6.9            |
| 2.3738        | 1.2884 | 273  | 2.5081          | 4.98         | 4.89            | 6.9            |
| 2.3651        | 1.3877 | 294  | 2.5076          | 4.98         | 4.89            | 6.9            |
| 2.4476        | 1.4870 | 315  | 2.5073          | 4.98         | 4.89            | 6.9            |
| 2.4091        | 1.5863 | 336  | 2.5072          | 4.98         | 4.89            | 6.9            |
| 2.6352        | 1.6856 | 357  | 2.5071          | 4.98         | 4.89            | 6.9            |
| 2.5311        | 1.7849 | 378  | 2.5071          | 4.98         | 4.89            | 6.9            |
| 2.5747        | 1.8842 | 399  | 2.5071          | 4.98         | 4.89            | 6.9            |
| 2.3871        | 1.9835 | 420  | 2.5071          | 4.98         | 4.89            | 6.9            |

Framework versions

  • PEFT 0.17.1
  • Transformers 4.56.1
  • Pytorch 2.7.1+cu126
  • Datasets 4.0.0
  • Tokenizers 0.22.1