step3-tiny-random-vllm

56
3.0B
by
yujiepan
Language Model
OTHER
3B params
New
56 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
7GB+ RAM
Mobile
Laptop
Server
Quick Summary

This tiny, randomly-initialized model is intended for debugging purposes only.

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
3GB+ RAM

Code Examples

Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:text
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)
Printing the model:
Step3vForConditionalGeneration(
  (model): Step3vModel(
    (vision_model): StepCLIPVisionTransformer(
      (embeddings): StepCLIPVisionEmbeddings(
        (patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
        (position_embedding): Embedding(2705, 64)
      )
      (transformer): StepCLIPEncoder(
        (layers): ModuleList(
          (0-1): 2 x StepCLIPEncoderLayer(
            (layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
            (self_attn): StepCLIPAttention(
              (qkv_proj): Linear(in_features=64, out_features=192, bias=True)
              (out_proj): Linear(in_features=64, out_features=64, bias=True)
            )
            (mlp): StepCLIPMLP(
              (fc1): Linear(in_features=64, out_features=128, bias=True)
              (act): QuickGELUActivation()
              (fc2): Linear(in_features=128, out_features=64, bias=True)
            )
          )
        )
      )
    )
    (language_model): Step3Model(
      (embed_tokens): Embedding(128815, 32)
      (layers): ModuleList(
        (0): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (mlp): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
        (1): Step3vDecoderLayer(
          (self_attn): Step3vAttention(
            (q_proj): Linear(in_features=32, out_features=512, bias=False)
            (k_proj): Linear(in_features=32, out_features=256, bias=False)
            (v_proj): Linear(in_features=32, out_features=256, bias=False)
            (o_proj): Linear(in_features=512, out_features=32, bias=False)
            (inter_norm): Step3vRMSNorm((512,), eps=1e-05)
            (wq): Linear(in_features=512, out_features=512, bias=False)
          )
          (moe): Step3vMoEMLP(
            (gate): Linear(in_features=32, out_features=8, bias=False)
            (up_proj): MoELinear()
            (gate_proj): MoELinear()
            (down_proj): MoELinear()
            (act_fn): SiLU()
          )
          (share_expert): Step3vMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
          (post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
        )
      )
      (norm): Step3vRMSNorm((32,), eps=1e-05)
      (rotary_emb): Step3vRotaryEmbedding()
    )
    (vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
    (vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
  )
  (lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.