Koto-22B-PT

by allura-org · 22.0B parameters · license: apache-2.0 · 2 downloads · New · Early-stage
Edge AI targets: Mobile, Laptop, Server (50GB+ RAM)

Quick Summary

Koto-22B-PT is a depth-upscaled version of Mistral-Nemo-Base-2407, healed and trained on almost a billion tokens of creative writing data.
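
A minimal usage sketch, assuming the weights are published as allura-org/Koto-22B-PT and load through the standard Hugging Face transformers path; the repo id, prompt, and sampling settings below are illustrative, not taken from the model card. Since this is a base (pretrained) model, prompt it with raw prose rather than a chat template.

# Hypothetical inference sketch -- repo id and settings are assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "allura-org/Koto-22B-PT"  # assumed org/model naming from this page

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # bf16 weights need roughly 44 GB
    device_map="auto",           # shard across available GPUs or offload to CPU
)

# Base model: continue a raw prose prompt instead of a chat conversation.
prompt = "The lighthouse keeper had not spoken to another soul in three winters, until"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.8)
print(tokenizer.decode(output[0], skip_special_tokens=True))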

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
21GB+ RAM
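
The RAM tiers above track weight precision more than device class. A back-of-the-envelope sketch of the weight footprint alone (assuming 22.0B parameters and ignoring KV-cache and runtime overhead; these are estimates, not published figures):

# Rough weight-memory estimates for a 22.0B-parameter model (approximations only).
params = 22.0e9
for name, bytes_per_param in [("bf16/fp16", 2.0), ("int8", 1.0), ("4-bit", 0.5)]:
    gib = params * bytes_per_param / 1024**3
    print(f"{name:>9}: ~{gib:.0f} GiB of weights")
# bf16/fp16: ~41 GiB -- in line with the 50GB+ RAM note once overhead is added
#      int8: ~20 GiB -- close to the 21GB+ minimum recommended above
#     4-bit: ~10 GiB -- workable on a 16GB laptop, still above typical phone RAM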

Code Examples

Axolotl Config (yaml)
## model
base_model: allura-forge/nemo-upscaled-2
#tokenizer_use_mistral_common: true

## qlora COPE!!!
load_in_8bit: false
load_in_4bit: false
strict: false

## data 
datasets:
  - path: estrogen/bookscpt2
    type: completion
    field: text


shuffle_merged_datasets: true
dataset_prepared_path: dataset_preparedss
val_set_size: 0.0
output_dir: ./Pretrain

## Liger + CCE
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true

## CTX settings
sequence_len: 32768
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

## max grad norm
max_grad_norm: 1.0


## WandB
wandb_project: NeMo-Upscale
wandb_entity:
wandb_watch:
wandb_name: Pretrain-22B
wandb_log_model:

## hoe params
gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: rex
learning_rate: 2e-5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true
s2_attention:

warmup_steps: 50
saves_per_epoch: 2
debug:
deepspeed: ./deepspeed_configs/zero3_bf16.json
weight_decay: 0.0025
fsdp:
fsdp_config:
special_tokens:
  pad_token: <pad>
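
A minimal sketch of how a config like the one above is typically launched with Axolotl, assuming it is saved as koto-pretrain.yaml and that Axolotl, accelerate, and DeepSpeed are installed with the referenced deepspeed_configs/zero3_bf16.json present; the filename and Python wrapper are illustrative, not part of the original card.

# Hypothetical launch wrapper around Axolotl's documented CLI entry point.
import subprocess

subprocess.run(
    ["accelerate", "launch", "-m", "axolotl.cli.train", "koto-pretrain.yaml"],
    check=True,
)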
