EVA-Qwen3-Next-v0.0
60
3
License: Apache-2.0
by
EVA-UNIT-01
Code Model
OTHER
0.3B params
New
60 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
1GB+ RAM
Mobile
Laptop
Server
Quick Summary
A 0.3B-parameter code model published by EVA-UNIT-01 under the Apache-2.0 license.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
1GB+ RAM
Code Examples
yaml
_target_: megatron.bridge.training.config.ConfigContainer
checkpoint:
_target_: megatron.bridge.training.config.CheckpointConfig
async_save: false
ckpt_assume_constant_structure: false
ckpt_convert_format: null
ckpt_convert_save: null
ckpt_format: torch_dist
ckpt_step: null
dist_ckpt_optim_fully_reshardable: false
dist_ckpt_strictness: assume_ok_unexpected
distrib_optim_fully_reshardable_mem_efficient: false
exit_on_missing_checkpoint: false
finetune: false
fully_parallel_load: false
fully_parallel_save: true
load: workspace/results/qwen3_coder_next_base_lora/checkpoints
load_main_params_from_ckpt: false
load_optim: true
load_rng: true
most_recent_k: -1
non_persistent_ckpt_type: null
non_persistent_global_ckpt_dir: null
non_persistent_local_ckpt_algo: fully_parallel
non_persistent_local_ckpt_dir: null
non_persistent_save_interval: null
pretrained_checkpoint: workspace/models/Qwen3-Coder-Next-Base
replication: false
replication_factor: 2
replication_jump: null
save: workspace/results/qwen3_coder_next_base_lora/checkpoints
save_interval: 200
save_optim: true
save_rng: true
save_tokenizer_assets: true
strict_fsdp_dtensor_load: false
use_checkpoint_args: false
use_persistent_ckpt_worker: true
comm_overlap: null
dataset:
_target_: megatron.bridge.training.config.FinetuningDatasetConfig
data_sharding: true
dataloader_type: single
dataset_kwargs: null
dataset_root: workspace/datasets/lilith-v0.2/bridge_jsonl
do_test: true
do_validation: true
max_train_samples: null
memmap_workers: 1
num_workers: 8
packed_sequence_specs: null
persistent_workers: false
pin_memory: true
seed: 5678
seq_length: 2048
trust_remote_code: null
ddp:
_target_: megatron.core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig
align_param_gather: false
average_in_collective: true
bucket_size: null
check_for_large_grads: false
check_for_nan_in_grad: true
data_parallel_sharding_strategy: optim_grads_params
delay_wgrad_compute: false
disable_symmetric_registration: false
fp8_param_gather: false
fsdp_db_use_persist_buf_on_alloc_fail: false
fsdp_double_buffer: false
fsdp_manual_registration: false
grad_reduce_in_fp32: true
gradient_reduce_div_fusion: true
keep_fp8_transpose_cache: false
nccl_ub: false
num_distributed_optimizer_instances: 1
outer_dp_sharding_strategy: no_shard
overlap_grad_reduce: false
overlap_param_gather: false
pad_buckets_for_high_nccl_busbw: false
preserve_fp32_weights: true
reduce_scatter_with_fp32_accumulation: false
reuse_grad_buf_for_mxfp8_param_ag: false
suggested_communication_unit_size: null
use_custom_fsdp: false
use_distributed_optimizer: true
use_megatron_fsdp: false
dist:
_target_: megatron.bridge.training.config.DistributedInitConfig
align_grad_reduce: true
disable_jit_fuser: false
distributed_backend: nccl
distributed_timeout_minutes: 10
distributed_timeout_seconds_after_init: null
enable_megatron_core_experimental: false
external_gpu_device_mapping: false
high_priority_stream_groups: null
lazy_init: false
local_rank: 0
nccl_communicator_config_path: null
sharp_enabled_group: null
use_decentralized_pg: false
use_gloo_process_groups: true
use_megatron_fsdp: false
use_sharp: false
use_torch_fsdp2: false
use_tp_pp_dp_mapping: false
ft: null
inprocess_restart: null
logger:
_target_: megatron.bridge.training.config.LoggerConfig
filter_warnings: true
log_energy: false
log_interval: 1
log_l2_norm_grad_to_tensorboard: false
log_loss_scale_to_tensorboard: true
log_memory_to_tensorboard: false
log_params_norm: false
log_progress: false
log_runtime_to_tensorboard: false
log_throughput: false
log_throughput_to_tensorboard: false
log_timers_to_tensorboard: true
log_validation_ppl_to_tensorboard: false
log_world_size_to_tensorboard: false
logging_level: 20
memory_keys: null
mlflow_experiment: null
mlflow_run_name: null
mlflow_tags: null
mlflow_tracking_uri: null
modules_to_filter: null
runtime_time_unit: hours
save_config_filepath: null
set_level_for_all_loggers: false
skip_train_metrics_log: false
tensorboard_dir: workspace/results/qwen3_coder_next_base_lora/tb_logs
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
throughput_window_size: 100
timing_log_level: 0
timing_log_option: minmax
wandb_entity: nottlespike
wandb_exp_name: qwen3-coder-next-lora-lr1e3
wandb_project: qwen3-coder-next-lora
wandb_save_dir: null
mixed_precision:
_target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
autocast_dtype: null
autocast_enabled: false
bf16: true
first_last_layers_bf16: false
fp16: false
fp32: false
fp4: null
fp4_recipe: nvfp4
fp8: null
fp8_amax_compute_algo: most_recent
fp8_amax_history_len: 1
fp8_dot_product_attention: false
fp8_margin: 0
fp8_multi_head_attention: false
fp8_param: false
fp8_param_gather: false
fp8_recipe: tensorwise
fp8_wgrad: true
grad_reduce_in_fp32: true
hysteresis: 2
initial_loss_scale: 4294967296
loss_scale: null
loss_scale_window: 1000
min_loss_scale: 1.0
num_layers_at_end_in_bf16: 0
num_layers_at_start_in_bf16: 0
params_dtype:
_call_: false
_target_: torch.bfloat16
pipeline_dtype:
_call_: false
_target_: torch.bfloat16
reuse_grad_buf_for_mxfp8_param_ag: false
model:
_target_: megatron.bridge.models.qwen.qwen_provider.Qwen3NextModelProvider
account_for_embedding_in_pipeline_split: false
account_for_loss_in_pipeline_split: false
activation_func:
_call_: false
_target_: torch.nn.functional.silu
activation_func_clamp_value: null
activation_func_fp8_input_store: false
add_bias_linear: false
add_qkv_bias: false
apply_query_key_layer_scaling: false
apply_residual_connection_post_layernorm: false
apply_rope_fusion: false
async_tensor_model_parallel_allreduce: true
attention_backend:
_args_:
- 5
_call_: true
_name_: auto
_target_: megatron.core.transformer.enums.AttnBackend
attention_dropout: 0.0
attention_output_gate: true
attention_softmax_in_fp32: false
autocast_dtype:
_call_: false
_target_: torch.bfloat16
barrier_with_L1_time: true
batch_invariant_mode: false
batch_p2p_comm: true
batch_p2p_sync: true
bf16: true
bias_activation_fusion: false
bias_dropout_fusion: false
calculate_per_token_loss: false
clone_scatter_output_in_embedding: true
config_logger_dir: ''
context_parallel_size: 1
cp_comm_type: null
cpu_offloading: false
cpu_offloading_activations: true
cpu_offloading_double_buffering: false
cpu_offloading_num_layers: 0
cpu_offloading_weights: false
cross_entropy_fusion_impl: native
cross_entropy_loss_fusion: true
cuda_graph_impl: none
cuda_graph_retain_backward_graph: false
cuda_graph_scope: []
cuda_graph_use_single_mempool: false
cuda_graph_warmup_steps: 3
deallocate_pipeline_outputs: true
defer_embedding_wgrad_compute: false
delay_wgrad_compute: false
deterministic_mode: false
disable_bf16_reduced_precision_matmul: false
disable_parameter_transpose_cache: false
distribute_saved_activations: false
dsa_indexer_head_dim: null
dsa_indexer_loss_coeff: null
dsa_indexer_n_heads: null
dsa_indexer_topk: null
dsa_indexer_use_sparse_loss: false
embedding_init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.02
embedding_init_method_std: 0.02
enable_autocast: false
enable_cuda_graph: false
ep_overlap_early_attn_memory_release: false
experimental_attention_variant: gated_delta_net
expert_model_parallel_size: 8
expert_tensor_parallel_size: 1
external_cuda_graph: false
ffn_hidden_size: 5120
finalize_model_grads_func:
_args_: []
_partial_: true
_target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
pg_collection:
_call_: true
_target_: megatron.core.process_groups_config.ProcessGroupCollection
fine_grained_activation_offloading: false
first_last_layers_bf16: false
flash_decode: false
fp16: false
fp16_lm_cross_entropy: false
fp32_residual_connection: false
fp4: null
fp4_param: false
fp4_quantizer_factory: null
fp4_recipe: nvfp4
fp8: null
fp8_amax_compute_algo: most_recent
fp8_amax_history_len: 1
fp8_dot_product_attention: false
fp8_interval: 1
fp8_margin: 0
fp8_multi_head_attention: false
fp8_param: false
fp8_quantizer_factory: null
fp8_recipe: tensorwise
fp8_wgrad: true
fused_single_qkv_rope: false
gated_linear_unit: true
glu_linear_offset: 0.0
grad_scale_func:
_call_: false
_target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
grad_sync_func: null
gradient_accumulation_fusion: false
hetereogenous_dist_checkpoint: true
heterogeneous_block_specs: false
hf_model_id: Qwen/Qwen3-Coder-Next-Base
hidden_dropout: 0.0
hidden_size: 2048
hierarchical_context_parallel_sizes: null
hybrid_context_parallel: false
inference_fuse_tp_communication: false
inference_rng_tracker: false
inference_sampling_seed: 42
init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.02
init_method_std: 0.02
init_model_with_meta_device: false
is_hybrid_model: false
kitchen_attention_backend: sdpa
kv_channels: 256
layernorm_epsilon: 1.0e-06
layernorm_zero_centered_gamma: true
linear_attention_freq: 4
linear_conv_kernel_dim: 4
linear_key_head_dim: 128
linear_num_key_heads: 16
linear_num_value_heads: 32
linear_value_head_dim: 128
log_max_attention_logit: false
make_vocab_size_divisible_by: 128
mamba_head_dim: 64
mamba_num_groups: 8
mamba_num_heads: null
mamba_state_dim: 128
masked_softmax_fusion: true
max_position_embeddings: 40960
max_seqlen_per_dp_cp_rank: null
memory_efficient_layer_norm: false
microbatch_group_size_per_vp_stage: 1
min_offloaded_tensor_size: 1048576
mlp_chunks_for_prefill: 1
moe_apply_probs_on_input: false
moe_aux_loss_coeff: 0.001
moe_deepep_num_sms: 20
moe_enable_deepep: false
moe_enable_routing_replay: false
moe_expert_capacity_factor: null
moe_extended_tp: false
moe_ffn_hidden_size: 512
moe_flex_dispatcher_backend: deepep
moe_grouped_gemm: true
moe_hybridep_num_sms: 16
moe_input_jitter_eps: null
moe_latent_size: null
moe_layer_freq: 1
moe_layer_recompute: false
moe_pad_expert_input_to_capacity: false
moe_pad_experts_for_cuda_graph_inference: false
moe_per_layer_logging: false
moe_permute_fusion: true
moe_router_bias_update_rate: 0.001
moe_router_dtype: fp32
moe_router_enable_expert_bias: false
moe_router_force_load_balancing: false
moe_router_fusion: false
moe_router_group_topk: null
moe_router_load_balancing_type: global_aux_loss
moe_router_num_groups: null
moe_router_padding_for_fp8: false
moe_router_padding_for_quantization: false
moe_router_pre_softmax: false
moe_router_score_function: softmax
moe_router_topk: 10
moe_router_topk_limited_devices: null
moe_router_topk_scaling_factor: null
moe_shared_expert_gate: true
moe_shared_expert_intermediate_size: 512
moe_shared_expert_overlap: false
moe_token_dispatcher_type: alltoall
moe_token_drop_policy: probs
moe_token_dropping: false
moe_use_legacy_grouped_gemm: false
moe_z_loss_coeff: null
mrope_section: null
mtp_enabled: false
mtp_hybrid_override_pattern: null
mtp_loss_scaling_factor: 0.1
mtp_num_layers: null
mtp_standalone: false
mtp_use_repeated_layer: false
multi_latent_attention: false
nccl_all_reduce_for_prefill: false
no_rope_freq: null
no_sync_func: null
normalization: RMSNorm
num_attention_heads: 16
num_layers: 48
num_layers_at_end_in_bf16: 0
num_layers_at_start_in_bf16: 0
num_layers_in_first_pipeline_stage: null
num_layers_in_last_pipeline_stage: null
num_microbatches_with_partial_activation_checkpoints: null
num_moe_experts: 512
num_query_groups: 2
offload_modules: []
output_layer_init_method:
_args_: []
_partial_: true
_target_: torch.nn.init.normal_
mean: 0.0
std: 0.0020412414523193153
overlap_moe_expert_parallel_comm: false
overlap_p2p_comm: false
overlap_p2p_comm_warmup_flush: false
parallel_output: true
param_sync_func: null
params_dtype:
_call_: false
_target_: torch.bfloat16
perform_initialization: true
persist_layer_norm: false
pipeline_dtype:
_call_: false
_target_: torch.bfloat16
pipeline_model_parallel_comm_backend: null
pipeline_model_parallel_layout: null
pipeline_model_parallel_size: 1
position_embedding_type: rope
qk_clip: false
qk_clip_alpha: 0.5
qk_clip_threshold: 100
qk_l2_norm: false
qk_layernorm: true
quant_recipe: null
recompute_granularity: selective
recompute_method: null
recompute_modules:
- layernorm
- moe
- moe_act
recompute_num_layers: null
restore_modelopt_state: false
rope_scaling: false
rope_scaling_factor: 1.0
rotary_base: 5000000
rotary_interleaved: false
rotary_percent: 0.25
rotary_scaling_factor: null
scatter_embedding_sequence_parallel: true
seq_len_interpolation_factor: null
seq_length: 2048
sequence_parallel: false
share_embeddings_and_output_weights: false
should_pad_vocab: false
softmax_scale: null
softmax_type: vanilla
symmetric_ar_type: null
tensor_model_parallel_size: 1
test_mode: false
timers:
_call_: true
_target_: megatron.core.timers.Timers
tp_comm_atomic_ag: false
tp_comm_atomic_rs: false
tp_comm_bootstrap_backend: nccl
tp_comm_bulk_dgrad: true
tp_comm_bulk_wgrad: true
tp_comm_overlap: false
tp_comm_overlap_ag: true
tp_comm_overlap_cfg: null
tp_comm_overlap_disable_fc1: false
tp_comm_overlap_disable_qkv: false
tp_comm_overlap_rs: true
tp_comm_overlap_rs_dgrad: false
tp_comm_split_ag: true
tp_comm_split_rs: true
tp_only_amax_red: false
transformer_impl: transformer_engine
transformer_layer_spec:
_call_: false
_target_: megatron.core.models.gpt.experimental_attention_variant_module_specs.get_transformer_block_with_experimental_attention_variant_spec
use_arbitrary_attention_mask: null
use_cpu_initialization: false
use_fused_weighted_squared_relu: false
use_inference_optimized_layers: false
use_kitchen: false
use_kitchen_attention: false
use_mamba_mem_eff_path: true
use_ring_exchange_p2p: false
use_te_activation_func: false
use_te_rng_tracker: false
use_transformer_engine_full_layer_spec: false
use_transformer_engine_op_fuser: false
variable_seq_lengths: false
virtual_pipeline_model_parallel_size: null
vocab_size: 151936
wgrad_deferral_limit: 0
window_attn_skip_freq: null
window_size: null
nvrx_straggler: null
optimizer:
_target_: megatron.bridge.training.config.OptimizerConfig
adam_beta1: 0.9
adam_beta2: 0.98
adam_eps: 1.0e-08
apply_wd_to_qk_layernorm: false
barrier_with_L1_time: false
bf16: true
clip_grad: 1.0
config_logger_dir: ''
decoupled_lr: null
decoupled_min_lr: null
decoupled_weight_decay: true
exp_avg_dtype:
_call_: false
_target_: torch.float32
exp_avg_sq_dtype:
_call_: false
_target_: torch.float32
fp16: false
fp8_recipe: tensorwise
hysteresis: 2
initial_loss_scale: 4294967296
log_num_zeros_in_grad: false
loss_scale: null
loss_scale_window: 1000
lr: 0.001
main_grads_dtype:
_call_: false
_target_: torch.float32
main_params_dtype:
_call_: false
_target_: torch.float32
min_loss_scale: 1.0
min_lr: 0.0001
muon_extra_scale_factor: 1.0
muon_fp32_matmul_prec: medium
muon_momentum: 0.95
muon_num_ns_steps: 5
muon_scale_mode: spectral
muon_split_qkv: true
muon_tp_mode: blockwise
muon_use_nesterov: false
optimizer: adam
optimizer_cpu_offload: false
optimizer_offload_fraction: 0.0
overlap_cpu_optimizer_d2h_h2d: false
overlap_param_gather: false
overlap_param_gather_with_optimizer_step: false
params_dtype:
_call_: false
_target_: torch.bfloat16
pin_cpu_grads: true
pin_cpu_params: true
reuse_grad_buf_for_mxfp8_param_ag: false
sgd_momentum: 0.9
store_param_remainders: true
timers:
_call_: true
_target_: megatron.core.timers.Timers
use_distributed_optimizer: true
use_precision_aware_optimizer: false
use_torch_optimizer_for_cpu_offload: false
weight_decay: 0.1
optimizer_config_override_provider:
_target_: megatron.bridge.training.config.OptimizerConfigOverrideProvider
peft:
_target_: megatron.bridge.peft.lora.LoRA
a2a_experimental: false
alpha: 64
canonical_mapping: {}
dim: 64
dropout: 0.0
dropout_position: pre
exclude_modules: []
lora_A_init_method: xavier
lora_B_init_method: zero
lora_dtype: null
params_to_save: !!set
decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.11.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.11.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.11.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.11.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.15.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.15.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.15.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.15.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.19.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.19.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.19.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.19.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.23.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.23.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.23.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.23.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.27.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.27.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.27.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.27.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.3.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.3.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.3.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.3.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.31.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.31.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.31.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.31.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.35.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.35.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.35.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.35.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.39.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.39.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.39.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.39.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.43.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.43.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.43.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.43.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.47.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.47.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.47.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.47.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.7.self_attention.linear_proj.adapter.linear_in.weight: null
decoder.layers.7.self_attention.linear_proj.adapter.linear_out.weight: null
decoder.layers.7.self_attention.linear_qkv.adapter.linear_in.weight: null
decoder.layers.7.self_attention.linear_qkv.adapter.linear_out.weight: null
decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_out.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
target_modules:
- linear_qkv
- linear_proj
- linear_fc1
- linear_fc2
profiling:
_target_: megatron.bridge.training.config.ProfilingConfig
memory_snapshot_path: snapshot.pickle
nvtx_ranges: false
profile_ranks: []
profile_step_end: 12
profile_step_start: 10
pytorch_profiler_collect_callstack: false
pytorch_profiler_collect_chakra: false
pytorch_profiler_collect_shapes: false
record_memory_history: false
record_shapes: false
use_nsys_profiler: false
use_pytorch_profiler: false
rerun_state_machine:
_target_: megatron.bridge.training.config.RerunStateMachineConfig
check_for_nan_in_loss: true
check_for_spiky_loss: false
error_injection_rate: 0
error_injection_type: transient_error
rerun_mode: disabled
spiky_loss_factor: 10.0
rng:
_target_: megatron.bridge.training.config.RNGConfig
data_parallel_random_init: false
inference_rng_tracker: false
seed: 5678
te_rng_tracker: false
scheduler:
_target_: megatron.bridge.training.config.SchedulerConfig
end_weight_decay: 0.033
lr_decay_iters: 2000
lr_decay_samples: null
lr_decay_steps: 64000
lr_decay_style: cosine
lr_warmup_fraction: null
lr_warmup_init: 0.0
lr_warmup_iters: 50
lr_warmup_samples: 0
lr_warmup_steps: 1600
lr_wsd_decay_iters: null
lr_wsd_decay_samples: null
lr_wsd_decay_style: exponential
no_weight_decay_cond_type: qwen3_next
override_opt_param_scheduler: true
start_weight_decay: 0.033
use_checkpoint_opt_param_scheduler: false
wd_incr_steps: 64000
weight_decay_incr_style: constant
wsd_decay_steps: null
straggler: null
tensor_inspect: null
tokenizer:
_target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
chat_template: null
force_system_message: false
hf_tokenizer_kwargs: {}
image_tag_type: null
merge_file: null
metadata_path: null
sp_tokenizer_kwargs: {}
special_tokens: null
tiktoken_num_special_tokens: 1000
tiktoken_pattern: null
tiktoken_special_tokens: null
tokenizer_model: Qwen/Qwen3-Coder-Next-Base
tokenizer_prompt_format: null
tokenizer_type: HuggingFaceTokenizer
vocab_extra_ids: 0
vocab_file: null
vocab_size: null
train:
_target_: megatron.bridge.training.config.TrainingConfig
check_optimizer_step_success: true
check_weight_hash_across_dp_replicas_interval: null
decrease_batch_size_if_needed: false
empty_unused_memory_level: 0
eval_interval: null
eval_iters: null
exit_duration_in_mins: null
exit_interval: null
exit_signal:
_args_:
- 15
_call_: true
_name_: SIGTERM
_target_: signal.Signals
exit_signal_handler: false
exit_signal_handler_for_dataloader: false
global_batch_size: 32
iterations_to_skip: []
manual_gc: true
manual_gc_eval: 100
manual_gc_interval: 100
micro_batch_size: 1
rampup_batch_size: null
skip_sync_grad_norm_across_mp: false
skip_train: null
train_iters: 2000
train_samples: null
train_sync_interval: null
validation:
_target_: megatron.bridge.training.config.ValidationConfig
eval_interval: 9999
eval_iters: 32
skip_train: false