EVA-Qwen3-Next-v0.0

60
3
License: Apache-2.0
by
EVA-UNIT-01
Code Model
OTHER
0.3B params
New
60 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
1GB+ RAM
Mobile
Laptop
Server
Quick Summary

EVA-Qwen3-Next is a code-specialized AI model published by EVA-UNIT-01.

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
1GB+ RAM

Code Examples

yaml
_target_: megatron.bridge.training.config.ConfigContainer
checkpoint:
  _target_: megatron.bridge.training.config.CheckpointConfig
  async_save: false
  ckpt_assume_constant_structure: false
  ckpt_convert_format: null
  ckpt_convert_save: null
  ckpt_format: torch_dist
  ckpt_step: null
  dist_ckpt_optim_fully_reshardable: false
  dist_ckpt_strictness: assume_ok_unexpected
  distrib_optim_fully_reshardable_mem_efficient: false
  exit_on_missing_checkpoint: false
  finetune: false
  fully_parallel_load: false
  fully_parallel_save: true
  load: workspace/results/qwen3_coder_next_base_lora/checkpoints
  load_main_params_from_ckpt: false
  load_optim: true
  load_rng: true
  most_recent_k: -1
  non_persistent_ckpt_type: null
  non_persistent_global_ckpt_dir: null
  non_persistent_local_ckpt_algo: fully_parallel
  non_persistent_local_ckpt_dir: null
  non_persistent_save_interval: null
  pretrained_checkpoint: workspace/models/Qwen3-Coder-Next-Base
  replication: false
  replication_factor: 2
  replication_jump: null
  save: workspace/results/qwen3_coder_next_base_lora/checkpoints
  save_interval: 200
  save_optim: true
  save_rng: true
  save_tokenizer_assets: true
  strict_fsdp_dtensor_load: false
  use_checkpoint_args: false
  use_persistent_ckpt_worker: true
comm_overlap: null
dataset:
  _target_: megatron.bridge.training.config.FinetuningDatasetConfig
  data_sharding: true
  dataloader_type: single
  dataset_kwargs: null
  dataset_root: workspace/datasets/lilith-v0.2/bridge_jsonl
  do_test: true
  do_validation: true
  max_train_samples: null
  memmap_workers: 1
  num_workers: 8
  packed_sequence_specs: null
  persistent_workers: false
  pin_memory: true
  seed: 5678
  seq_length: 2048
  trust_remote_code: null
ddp:
  _target_: megatron.core.distributed.distributed_data_parallel_config.DistributedDataParallelConfig
  align_param_gather: false
  average_in_collective: true
  bucket_size: null
  check_for_large_grads: false
  check_for_nan_in_grad: true
  data_parallel_sharding_strategy: optim_grads_params
  delay_wgrad_compute: false
  disable_symmetric_registration: false
  fp8_param_gather: false
  fsdp_db_use_persist_buf_on_alloc_fail: false
  fsdp_double_buffer: false
  fsdp_manual_registration: false
  grad_reduce_in_fp32: true
  gradient_reduce_div_fusion: true
  keep_fp8_transpose_cache: false
  nccl_ub: false
  num_distributed_optimizer_instances: 1
  outer_dp_sharding_strategy: no_shard
  overlap_grad_reduce: false
  overlap_param_gather: false
  pad_buckets_for_high_nccl_busbw: false
  preserve_fp32_weights: true
  reduce_scatter_with_fp32_accumulation: false
  reuse_grad_buf_for_mxfp8_param_ag: false
  suggested_communication_unit_size: null
  use_custom_fsdp: false
  use_distributed_optimizer: true
  use_megatron_fsdp: false
dist:
  _target_: megatron.bridge.training.config.DistributedInitConfig
  align_grad_reduce: true
  disable_jit_fuser: false
  distributed_backend: nccl
  distributed_timeout_minutes: 10
  distributed_timeout_seconds_after_init: null
  enable_megatron_core_experimental: false
  external_gpu_device_mapping: false
  high_priority_stream_groups: null
  lazy_init: false
  local_rank: 0
  nccl_communicator_config_path: null
  sharp_enabled_group: null
  use_decentralized_pg: false
  use_gloo_process_groups: true
  use_megatron_fsdp: false
  use_sharp: false
  use_torch_fsdp2: false
  use_tp_pp_dp_mapping: false
ft: null
inprocess_restart: null
logger:
  _target_: megatron.bridge.training.config.LoggerConfig
  filter_warnings: true
  log_energy: false
  log_interval: 1
  log_l2_norm_grad_to_tensorboard: false
  log_loss_scale_to_tensorboard: true
  log_memory_to_tensorboard: false
  log_params_norm: false
  log_progress: false
  log_runtime_to_tensorboard: false
  log_throughput: false
  log_throughput_to_tensorboard: false
  log_timers_to_tensorboard: true
  log_validation_ppl_to_tensorboard: false
  log_world_size_to_tensorboard: false
  logging_level: 20
  memory_keys: null
  mlflow_experiment: null
  mlflow_run_name: null
  mlflow_tags: null
  mlflow_tracking_uri: null
  modules_to_filter: null
  runtime_time_unit: hours
  save_config_filepath: null
  set_level_for_all_loggers: false
  skip_train_metrics_log: false
  tensorboard_dir: workspace/results/qwen3_coder_next_base_lora/tb_logs
  tensorboard_log_interval: 1
  tensorboard_queue_size: 1000
  throughput_window_size: 100
  timing_log_level: 0
  timing_log_option: minmax
  wandb_entity: nottlespike
  wandb_exp_name: qwen3-coder-next-lora-lr1e3
  wandb_project: qwen3-coder-next-lora
  wandb_save_dir: null
mixed_precision:
  _target_: megatron.bridge.training.mixed_precision.MixedPrecisionConfig
  autocast_dtype: null
  autocast_enabled: false
  bf16: true
  first_last_layers_bf16: false
  fp16: false
  fp32: false
  fp4: null
  fp4_recipe: nvfp4
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_param: false
  fp8_param_gather: false
  fp8_recipe: tensorwise
  fp8_wgrad: true
  grad_reduce_in_fp32: true
  hysteresis: 2
  initial_loss_scale: 4294967296
  loss_scale: null
  loss_scale_window: 1000
  min_loss_scale: 1.0
  num_layers_at_end_in_bf16: 0
  num_layers_at_start_in_bf16: 0
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  pipeline_dtype:
    _call_: false
    _target_: torch.bfloat16
  reuse_grad_buf_for_mxfp8_param_ag: false
model:
  _target_: megatron.bridge.models.qwen.qwen_provider.Qwen3NextModelProvider
  account_for_embedding_in_pipeline_split: false
  account_for_loss_in_pipeline_split: false
  activation_func:
    _call_: false
    _target_: torch.nn.functional.silu
  activation_func_clamp_value: null
  activation_func_fp8_input_store: false
  add_bias_linear: false
  add_qkv_bias: false
  apply_query_key_layer_scaling: false
  apply_residual_connection_post_layernorm: false
  apply_rope_fusion: false
  async_tensor_model_parallel_allreduce: true
  attention_backend:
    _args_:
    - 5
    _call_: true
    _name_: auto
    _target_: megatron.core.transformer.enums.AttnBackend
  attention_dropout: 0.0
  attention_output_gate: true
  attention_softmax_in_fp32: false
  autocast_dtype:
    _call_: false
    _target_: torch.bfloat16
  barrier_with_L1_time: true
  batch_invariant_mode: false
  batch_p2p_comm: true
  batch_p2p_sync: true
  bf16: true
  bias_activation_fusion: false
  bias_dropout_fusion: false
  calculate_per_token_loss: false
  clone_scatter_output_in_embedding: true
  config_logger_dir: ''
  context_parallel_size: 1
  cp_comm_type: null
  cpu_offloading: false
  cpu_offloading_activations: true
  cpu_offloading_double_buffering: false
  cpu_offloading_num_layers: 0
  cpu_offloading_weights: false
  cross_entropy_fusion_impl: native
  cross_entropy_loss_fusion: true
  cuda_graph_impl: none
  cuda_graph_retain_backward_graph: false
  cuda_graph_scope: []
  cuda_graph_use_single_mempool: false
  cuda_graph_warmup_steps: 3
  deallocate_pipeline_outputs: true
  defer_embedding_wgrad_compute: false
  delay_wgrad_compute: false
  deterministic_mode: false
  disable_bf16_reduced_precision_matmul: false
  disable_parameter_transpose_cache: false
  distribute_saved_activations: false
  dsa_indexer_head_dim: null
  dsa_indexer_loss_coeff: null
  dsa_indexer_n_heads: null
  dsa_indexer_topk: null
  dsa_indexer_use_sparse_loss: false
  embedding_init_method:
    _args_: []
    _partial_: true
    _target_: torch.nn.init.normal_
    mean: 0.0
    std: 0.02
  embedding_init_method_std: 0.02
  enable_autocast: false
  enable_cuda_graph: false
  ep_overlap_early_attn_memory_release: false
  experimental_attention_variant: gated_delta_net
  expert_model_parallel_size: 8
  expert_tensor_parallel_size: 1
  external_cuda_graph: false
  ffn_hidden_size: 5120
  finalize_model_grads_func:
    _args_: []
    _partial_: true
    _target_: megatron.core.distributed.finalize_model_grads.finalize_model_grads
    pg_collection:
      _call_: true
      _target_: megatron.core.process_groups_config.ProcessGroupCollection
  fine_grained_activation_offloading: false
  first_last_layers_bf16: false
  flash_decode: false
  fp16: false
  fp16_lm_cross_entropy: false
  fp32_residual_connection: false
  fp4: null
  fp4_param: false
  fp4_quantizer_factory: null
  fp4_recipe: nvfp4
  fp8: null
  fp8_amax_compute_algo: most_recent
  fp8_amax_history_len: 1
  fp8_dot_product_attention: false
  fp8_interval: 1
  fp8_margin: 0
  fp8_multi_head_attention: false
  fp8_param: false
  fp8_quantizer_factory: null
  fp8_recipe: tensorwise
  fp8_wgrad: true
  fused_single_qkv_rope: false
  gated_linear_unit: true
  glu_linear_offset: 0.0
  grad_scale_func:
    _call_: false
    _target_: megatron.core.optimizer.optimizer.MegatronOptimizer.scale_loss
  grad_sync_func: null
  gradient_accumulation_fusion: false
  hetereogenous_dist_checkpoint: true
  heterogeneous_block_specs: false
  hf_model_id: Qwen/Qwen3-Coder-Next-Base
  hidden_dropout: 0.0
  hidden_size: 2048
  hierarchical_context_parallel_sizes: null
  hybrid_context_parallel: false
  inference_fuse_tp_communication: false
  inference_rng_tracker: false
  inference_sampling_seed: 42
  init_method:
    _args_: []
    _partial_: true
    _target_: torch.nn.init.normal_
    mean: 0.0
    std: 0.02
  init_method_std: 0.02
  init_model_with_meta_device: false
  is_hybrid_model: false
  kitchen_attention_backend: sdpa
  kv_channels: 256
  layernorm_epsilon: 1.0e-06
  layernorm_zero_centered_gamma: true
  linear_attention_freq: 4
  linear_conv_kernel_dim: 4
  linear_key_head_dim: 128
  linear_num_key_heads: 16
  linear_num_value_heads: 32
  linear_value_head_dim: 128
  log_max_attention_logit: false
  make_vocab_size_divisible_by: 128
  mamba_head_dim: 64
  mamba_num_groups: 8
  mamba_num_heads: null
  mamba_state_dim: 128
  masked_softmax_fusion: true
  max_position_embeddings: 40960
  max_seqlen_per_dp_cp_rank: null
  memory_efficient_layer_norm: false
  microbatch_group_size_per_vp_stage: 1
  min_offloaded_tensor_size: 1048576
  mlp_chunks_for_prefill: 1
  moe_apply_probs_on_input: false
  moe_aux_loss_coeff: 0.001
  moe_deepep_num_sms: 20
  moe_enable_deepep: false
  moe_enable_routing_replay: false
  moe_expert_capacity_factor: null
  moe_extended_tp: false
  moe_ffn_hidden_size: 512
  moe_flex_dispatcher_backend: deepep
  moe_grouped_gemm: true
  moe_hybridep_num_sms: 16
  moe_input_jitter_eps: null
  moe_latent_size: null
  moe_layer_freq: 1
  moe_layer_recompute: false
  moe_pad_expert_input_to_capacity: false
  moe_pad_experts_for_cuda_graph_inference: false
  moe_per_layer_logging: false
  moe_permute_fusion: true
  moe_router_bias_update_rate: 0.001
  moe_router_dtype: fp32
  moe_router_enable_expert_bias: false
  moe_router_force_load_balancing: false
  moe_router_fusion: false
  moe_router_group_topk: null
  moe_router_load_balancing_type: global_aux_loss
  moe_router_num_groups: null
  moe_router_padding_for_fp8: false
  moe_router_padding_for_quantization: false
  moe_router_pre_softmax: false
  moe_router_score_function: softmax
  moe_router_topk: 10
  moe_router_topk_limited_devices: null
  moe_router_topk_scaling_factor: null
  moe_shared_expert_gate: true
  moe_shared_expert_intermediate_size: 512
  moe_shared_expert_overlap: false
  moe_token_dispatcher_type: alltoall
  moe_token_drop_policy: probs
  moe_token_dropping: false
  moe_use_legacy_grouped_gemm: false
  moe_z_loss_coeff: null
  mrope_section: null
  mtp_enabled: false
  mtp_hybrid_override_pattern: null
  mtp_loss_scaling_factor: 0.1
  mtp_num_layers: null
  mtp_standalone: false
  mtp_use_repeated_layer: false
  multi_latent_attention: false
  nccl_all_reduce_for_prefill: false
  no_rope_freq: null
  no_sync_func: null
  normalization: RMSNorm
  num_attention_heads: 16
  num_layers: 48
  num_layers_at_end_in_bf16: 0
  num_layers_at_start_in_bf16: 0
  num_layers_in_first_pipeline_stage: null
  num_layers_in_last_pipeline_stage: null
  num_microbatches_with_partial_activation_checkpoints: null
  num_moe_experts: 512
  num_query_groups: 2
  offload_modules: []
  output_layer_init_method:
    _args_: []
    _partial_: true
    _target_: torch.nn.init.normal_
    mean: 0.0
    std: 0.0020412414523193153
  overlap_moe_expert_parallel_comm: false
  overlap_p2p_comm: false
  overlap_p2p_comm_warmup_flush: false
  parallel_output: true
  param_sync_func: null
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  perform_initialization: true
  persist_layer_norm: false
  pipeline_dtype:
    _call_: false
    _target_: torch.bfloat16
  pipeline_model_parallel_comm_backend: null
  pipeline_model_parallel_layout: null
  pipeline_model_parallel_size: 1
  position_embedding_type: rope
  qk_clip: false
  qk_clip_alpha: 0.5
  qk_clip_threshold: 100
  qk_l2_norm: false
  qk_layernorm: true
  quant_recipe: null
  recompute_granularity: selective
  recompute_method: null
  recompute_modules:
  - layernorm
  - moe
  - moe_act
  recompute_num_layers: null
  restore_modelopt_state: false
  rope_scaling: false
  rope_scaling_factor: 1.0
  rotary_base: 5000000
  rotary_interleaved: false
  rotary_percent: 0.25
  rotary_scaling_factor: null
  scatter_embedding_sequence_parallel: true
  seq_len_interpolation_factor: null
  seq_length: 2048
  sequence_parallel: false
  share_embeddings_and_output_weights: false
  should_pad_vocab: false
  softmax_scale: null
  softmax_type: vanilla
  symmetric_ar_type: null
  tensor_model_parallel_size: 1
  test_mode: false
  timers:
    _call_: true
    _target_: megatron.core.timers.Timers
  tp_comm_atomic_ag: false
  tp_comm_atomic_rs: false
  tp_comm_bootstrap_backend: nccl
  tp_comm_bulk_dgrad: true
  tp_comm_bulk_wgrad: true
  tp_comm_overlap: false
  tp_comm_overlap_ag: true
  tp_comm_overlap_cfg: null
  tp_comm_overlap_disable_fc1: false
  tp_comm_overlap_disable_qkv: false
  tp_comm_overlap_rs: true
  tp_comm_overlap_rs_dgrad: false
  tp_comm_split_ag: true
  tp_comm_split_rs: true
  tp_only_amax_red: false
  transformer_impl: transformer_engine
  transformer_layer_spec:
    _call_: false
    _target_: megatron.core.models.gpt.experimental_attention_variant_module_specs.get_transformer_block_with_experimental_attention_variant_spec
  use_arbitrary_attention_mask: null
  use_cpu_initialization: false
  use_fused_weighted_squared_relu: false
  use_inference_optimized_layers: false
  use_kitchen: false
  use_kitchen_attention: false
  use_mamba_mem_eff_path: true
  use_ring_exchange_p2p: false
  use_te_activation_func: false
  use_te_rng_tracker: false
  use_transformer_engine_full_layer_spec: false
  use_transformer_engine_op_fuser: false
  variable_seq_lengths: false
  virtual_pipeline_model_parallel_size: null
  vocab_size: 151936
  wgrad_deferral_limit: 0
  window_attn_skip_freq: null
  window_size: null
nvrx_straggler: null
optimizer:
  _target_: megatron.bridge.training.config.OptimizerConfig
  adam_beta1: 0.9
  adam_beta2: 0.98
  adam_eps: 1.0e-08
  apply_wd_to_qk_layernorm: false
  barrier_with_L1_time: false
  bf16: true
  clip_grad: 1.0
  config_logger_dir: ''
  decoupled_lr: null
  decoupled_min_lr: null
  decoupled_weight_decay: true
  exp_avg_dtype:
    _call_: false
    _target_: torch.float32
  exp_avg_sq_dtype:
    _call_: false
    _target_: torch.float32
  fp16: false
  fp8_recipe: tensorwise
  hysteresis: 2
  initial_loss_scale: 4294967296
  log_num_zeros_in_grad: false
  loss_scale: null
  loss_scale_window: 1000
  lr: 0.001
  main_grads_dtype:
    _call_: false
    _target_: torch.float32
  main_params_dtype:
    _call_: false
    _target_: torch.float32
  min_loss_scale: 1.0
  min_lr: 0.0001
  muon_extra_scale_factor: 1.0
  muon_fp32_matmul_prec: medium
  muon_momentum: 0.95
  muon_num_ns_steps: 5
  muon_scale_mode: spectral
  muon_split_qkv: true
  muon_tp_mode: blockwise
  muon_use_nesterov: false
  optimizer: adam
  optimizer_cpu_offload: false
  optimizer_offload_fraction: 0.0
  overlap_cpu_optimizer_d2h_h2d: false
  overlap_param_gather: false
  overlap_param_gather_with_optimizer_step: false
  params_dtype:
    _call_: false
    _target_: torch.bfloat16
  pin_cpu_grads: true
  pin_cpu_params: true
  reuse_grad_buf_for_mxfp8_param_ag: false
  sgd_momentum: 0.9
  store_param_remainders: true
  timers:
    _call_: true
    _target_: megatron.core.timers.Timers
  use_distributed_optimizer: true
  use_precision_aware_optimizer: false
  use_torch_optimizer_for_cpu_offload: false
  weight_decay: 0.1
optimizer_config_override_provider:
  _target_: megatron.bridge.training.config.OptimizerConfigOverrideProvider
peft:
  _target_: megatron.bridge.peft.lora.LoRA
  a2a_experimental: false
  alpha: 64
  canonical_mapping: {}
  dim: 64
  dropout: 0.0
  dropout_position: pre
  exclude_modules: []
  lora_A_init_method: xavier
  lora_B_init_method: zero
  lora_dtype: null
  params_to_save: !!set
    decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.0.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.0.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.0.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.0.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.1.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.1.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.1.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.1.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.10.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.10.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.10.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.10.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.11.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.11.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.11.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.11.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.11.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.11.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.11.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.11.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.12.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.12.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.12.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.12.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.13.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.13.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.13.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.13.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.14.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.14.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.14.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.14.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.15.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.15.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.15.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.15.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.15.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.15.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.15.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.15.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.16.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.16.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.16.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.16.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.17.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.17.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.17.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.17.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.18.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.18.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.18.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.18.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.19.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.19.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.19.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.19.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.19.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.19.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.19.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.19.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.2.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.2.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.2.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.2.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.20.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.20.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.20.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.20.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.21.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.21.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.21.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.21.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.22.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.22.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.22.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.22.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.23.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.23.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.23.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.23.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.23.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.23.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.23.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.23.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.24.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.24.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.24.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.24.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.25.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.25.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.25.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.25.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.26.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.26.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.26.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.26.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.27.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.27.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.27.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.27.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.27.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.27.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.27.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.27.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.28.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.28.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.28.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.28.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.29.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.29.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.29.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.29.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.3.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.3.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.3.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.3.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.3.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.3.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.3.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.3.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.30.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.30.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.30.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.30.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.31.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.31.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.31.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.31.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.31.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.31.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.31.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.31.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.32.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.32.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.32.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.32.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.33.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.33.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.33.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.33.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.34.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.34.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.34.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.34.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.35.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.35.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.35.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.35.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.35.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.35.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.35.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.35.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.36.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.36.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.36.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.36.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.37.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.37.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.37.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.37.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.38.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.38.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.38.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.38.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.39.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.39.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.39.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.39.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.39.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.39.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.39.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.39.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.4.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.4.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.4.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.4.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.40.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.40.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.40.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.40.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.41.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.41.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.41.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.41.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.42.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.42.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.42.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.42.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.43.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.43.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.43.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.43.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.43.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.43.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.43.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.43.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.44.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.44.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.44.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.44.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.45.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.45.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.45.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.45.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.46.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.46.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.46.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.46.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.47.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.47.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.47.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.47.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.47.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.47.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.47.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.47.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.5.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.5.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.5.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.5.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.6.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.6.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.6.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.6.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.7.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.7.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.7.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.7.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.7.self_attention.linear_proj.adapter.linear_in.weight: null
    decoder.layers.7.self_attention.linear_proj.adapter.linear_out.weight: null
    decoder.layers.7.self_attention.linear_qkv.adapter.linear_in.weight: null
    decoder.layers.7.self_attention.linear_qkv.adapter.linear_out.weight: null
    decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.8.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.8.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.8.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.8.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.9.mlp.experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.9.mlp.experts.linear_fc2.adapter.linear_out.weight: null
    decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_in.weight: null
    decoder.layers.9.mlp.shared_experts.linear_fc1.adapter.linear_out.weight: null
    decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_in.weight: null
    decoder.layers.9.mlp.shared_experts.linear_fc2.adapter.linear_out.weight: null
  target_modules:
  - linear_qkv
  - linear_proj
  - linear_fc1
  - linear_fc2
profiling:
  _target_: megatron.bridge.training.config.ProfilingConfig
  memory_snapshot_path: snapshot.pickle
  nvtx_ranges: false
  profile_ranks: []
  profile_step_end: 12
  profile_step_start: 10
  pytorch_profiler_collect_callstack: false
  pytorch_profiler_collect_chakra: false
  pytorch_profiler_collect_shapes: false
  record_memory_history: false
  record_shapes: false
  use_nsys_profiler: false
  use_pytorch_profiler: false
rerun_state_machine:
  _target_: megatron.bridge.training.config.RerunStateMachineConfig
  check_for_nan_in_loss: true
  check_for_spiky_loss: false
  error_injection_rate: 0
  error_injection_type: transient_error
  rerun_mode: disabled
  spiky_loss_factor: 10.0
rng:
  _target_: megatron.bridge.training.config.RNGConfig
  data_parallel_random_init: false
  inference_rng_tracker: false
  seed: 5678
  te_rng_tracker: false
scheduler:
  _target_: megatron.bridge.training.config.SchedulerConfig
  end_weight_decay: 0.033
  lr_decay_iters: 2000
  lr_decay_samples: null
  lr_decay_steps: 64000
  lr_decay_style: cosine
  lr_warmup_fraction: null
  lr_warmup_init: 0.0
  lr_warmup_iters: 50
  lr_warmup_samples: 0
  lr_warmup_steps: 1600
  lr_wsd_decay_iters: null
  lr_wsd_decay_samples: null
  lr_wsd_decay_style: exponential
  no_weight_decay_cond_type: qwen3_next
  override_opt_param_scheduler: true
  start_weight_decay: 0.033
  use_checkpoint_opt_param_scheduler: false
  wd_incr_steps: 64000
  weight_decay_incr_style: constant
  wsd_decay_steps: null
straggler: null
tensor_inspect: null
tokenizer:
  _target_: megatron.bridge.training.tokenizers.config.TokenizerConfig
  chat_template: null
  force_system_message: false
  hf_tokenizer_kwargs: {}
  image_tag_type: null
  merge_file: null
  metadata_path: null
  sp_tokenizer_kwargs: {}
  special_tokens: null
  tiktoken_num_special_tokens: 1000
  tiktoken_pattern: null
  tiktoken_special_tokens: null
  tokenizer_model: Qwen/Qwen3-Coder-Next-Base
  tokenizer_prompt_format: null
  tokenizer_type: HuggingFaceTokenizer
  vocab_extra_ids: 0
  vocab_file: null
  vocab_size: null
train:
  _target_: megatron.bridge.training.config.TrainingConfig
  check_optimizer_step_success: true
  check_weight_hash_across_dp_replicas_interval: null
  decrease_batch_size_if_needed: false
  empty_unused_memory_level: 0
  eval_interval: null
  eval_iters: null
  exit_duration_in_mins: null
  exit_interval: null
  exit_signal:
    _args_:
    - 15
    _call_: true
    _name_: SIGTERM
    _target_: signal.Signals
  exit_signal_handler: false
  exit_signal_handler_for_dataloader: false
  global_batch_size: 32
  iterations_to_skip: []
  manual_gc: true
  manual_gc_eval: 100
  manual_gc_interval: 100
  micro_batch_size: 1
  rampup_batch_size: null
  skip_sync_grad_norm_across_mp: false
  skip_train: null
  train_iters: 2000
  train_samples: null
  train_sync_interval: null
validation:
  _target_: megatron.bridge.training.config.ValidationConfig
  eval_interval: 9999
  eval_iters: 32
  skip_train: false

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with a simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.