Koto-22B-PT
2
8
22.0B
license:apache-2.0
by
allura-org
Other
OTHER
22B params
New
2 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
50GB+ RAM
Mobile
Laptop
Server
Quick Summary
Koto-22B-PT is a depth-upscaled version of Mistral-Nemo-Base-2407, healed and trained on almost a billion tokens of creative writing data.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
21GB+ RAM
Code Examples
Axolotl Config (YAML)
# Axolotl training config for the run logged as "Pretrain-22B".
# Keys with no value are intentionally left unset (YAML null).

## model
base_model: allura-forge/nemo-upscaled-2
# tokenizer_use_mistral_common: true

## quantized loading is switched off (both 8-bit and 4-bit)
load_in_8bit: false
load_in_4bit: false
strict: false

## data
# One completion-format dataset, read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain

## Liger + CCE
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
# Liger's fused linear cross-entropy is off while cut_cross_entropy is on.
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true

## context settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

## max grad norm
max_grad_norm: 1.0

## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:

## hyperparameters
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Axolotl Config (YAML)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true
## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false
## data
# Training data: one completion-format dataset read from its `text` field.
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text
shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain
## Liger + CCE
plugins:
- axolotl.integrations.liger.LigerPlugin
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true
## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
## max grad norm
max_grad_norm: 1.0
## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:
## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5
train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false
gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:
warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.