---
# Second-phase SFT config for AMD-OLMo-1B (OLMo-style TrainConfig).
# NOTE(review): original file was wrapped in markdown table pipes and had lost
# its indentation; nesting below is reconstructed per the OLMo config schema —
# confirm against the upstream AMD-OLMo repository before use.
run_name: AMD-OLMo-1B-SFT-2nd-phase
seed: 6198
dry_run: false

# Weights & Biases logging; ${run_name} is an OmegaConf-style interpolation.
wandb:
  name: ${run_name}
  project: AMD-OLMo
  group: SFT

# 1B-parameter OLMo architecture (16 layers x 16 heads, d_model 2048).
model:
  d_model: 2048
  n_heads: 16
  n_layers: 16
  mlp_ratio: 8
  weight_tying: true
  alibi: false
  rope: true
  flash_attention: false
  attention_dropout: 0.0
  attention_layer_norm: false
  multi_query_attention: false
  include_bias: false
  block_type: sequential
  layer_norm_type: default
  layer_norm_with_affine: false
  bias_for_layer_norm: false
  attention_layer_norm_with_affine: false
  activation_type: swiglu
  residual_dropout: 0.0
  embedding_dropout: 0.0
  max_sequence_length: 2048
  # embedding_size is padded above vocab_size (50280 -> 50304) for throughput.
  vocab_size: 50280
  embedding_size: 50304
  eos_token_id: 50279
  pad_token_id: 1
  init_device: meta
  init_fn: mitchell

compile:
  fullgraph: false

optimizer:
  name: adamw
  learning_rate: 2.0e-5
  weight_decay: 0
  betas:
    - 0.9
    - 0.95
  metrics_log_interval: 10

scheduler:
  name: linear_with_warmup
  t_warmup: 200
  alpha_f: 0.001

tokenizer:
  identifier: tokenizers/allenai_eleuther-ai-gpt-neox-20b-pii-special.json
  truncate_direction: right

save_folder: ./outputs/${run_name}/
save_overwrite: true
# Sharded checkpoints (frequent; -1 keeps all of them).
save_interval: 1000
save_num_checkpoints_to_keep: -1
# Unsharded checkpoints (less frequent; -1 keeps all of them).
save_interval_unsharded: 10000
save_num_unsharded_checkpoints_to_keep: -1

# Placeholder: point at the unsharded checkpoint produced by phase-1 SFT.
load_path: path_to_unsharded_1st_phase_SFT_checkpoint
reset_trainer_state: true

max_duration: 3ep
global_train_batch_size: 512
device_train_microbatch_size: 8

precision: amp_bf16

fsdp:
  wrapping_strategy: null
  precision: mixed

max_grad_norm: 1.0
max_grad_norm_ratio: null

speed_monitor:
  window_size: 20

eval_interval: ${save_interval}
eval_subset_num_batches: -1
device_eval_batch_size: ${device_train_microbatch_size}

# Downstream zero-shot evaluation suites run during training.
evaluators:
  - label: piqa
    type: downstream

  - label: hellaswag
    type: downstream

  - label: winogrande
    type: downstream

  - label: openbook_qa
    type: downstream

  - label: sciq
    type: downstream

  - label: arc_easy
    type: downstream

  - label: copa
    type: downstream

  - label: rte
    type: downstream

  - label: commitment_bank
    type: downstream

  - label: mrpc
    type: downstream

  - label: sst2
    type: downstream

data:
  pad_direction: right
  num_workers: 0
  drop_last: true
  pin_memory: true
  prefetch_factor: 1
  persistent_workers: true
  timeout: 0
  generate_attention_mask: true
  paths:
    - ./datasets/OpenHermes_WebInstructSub_CodeFeedBack/input_ids.npy
  # Per-token loss masks aligned with `paths` (masks out prompt tokens).
  label_mask_paths:
    - ./datasets/OpenHermes_WebInstructSub_CodeFeedBack/label_mask.npy