step3-tiny-random-vllm
56
3.0B
—
by
yujiepan
Language Model
OTHER
3B params
New
56 downloads
Early-stage
Edge AI device compatibility:
Mobile
Laptop
Server
7GB+ RAM
Mobile
Laptop
Server
Quick Summary
This tiny, randomly-initialized model is intended for debugging and integration testing, not for producing meaningful outputs.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
3GB+ RAM
Code Examples
Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model:
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model (text):
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model (text):
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)

Printing the model (text):
Step3vForConditionalGeneration(
(model): Step3vModel(
(vision_model): StepCLIPVisionTransformer(
(embeddings): StepCLIPVisionEmbeddings(
(patch_embedding): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14))
(position_embedding): Embedding(2705, 64)
)
(transformer): StepCLIPEncoder(
(layers): ModuleList(
(0-1): 2 x StepCLIPEncoderLayer(
(layer_norm1): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(layer_norm2): LayerNorm((64,), eps=1e-06, elementwise_affine=True)
(self_attn): StepCLIPAttention(
(qkv_proj): Linear(in_features=64, out_features=192, bias=True)
(out_proj): Linear(in_features=64, out_features=64, bias=True)
)
(mlp): StepCLIPMLP(
(fc1): Linear(in_features=64, out_features=128, bias=True)
(act): QuickGELUActivation()
(fc2): Linear(in_features=128, out_features=64, bias=True)
)
)
)
)
)
(language_model): Step3Model(
(embed_tokens): Embedding(128815, 32)
(layers): ModuleList(
(0): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(mlp): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
(1): Step3vDecoderLayer(
(self_attn): Step3vAttention(
(q_proj): Linear(in_features=32, out_features=512, bias=False)
(k_proj): Linear(in_features=32, out_features=256, bias=False)
(v_proj): Linear(in_features=32, out_features=256, bias=False)
(o_proj): Linear(in_features=512, out_features=32, bias=False)
(inter_norm): Step3vRMSNorm((512,), eps=1e-05)
(wq): Linear(in_features=512, out_features=512, bias=False)
)
(moe): Step3vMoEMLP(
(gate): Linear(in_features=32, out_features=8, bias=False)
(up_proj): MoELinear()
(gate_proj): MoELinear()
(down_proj): MoELinear()
(act_fn): SiLU()
)
(share_expert): Step3vMLP(
(gate_proj): Linear(in_features=32, out_features=64, bias=False)
(up_proj): Linear(in_features=32, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=32, bias=False)
(act_fn): SiLU()
)
(input_layernorm): Step3vRMSNorm((32,), eps=1e-05)
(post_attention_layernorm): Step3vRMSNorm((32,), eps=1e-05)
)
)
(norm): Step3vRMSNorm((32,), eps=1e-05)
(rotary_emb): Step3vRotaryEmbedding()
)
(vit_downsampler): Conv2d(64, 64, kernel_size=(2, 2), stride=(2, 2))
(vit_downsampler2): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
(vit_large_projector): Linear(in_features=128, out_features=32, bias=False)
)
(lm_head): Linear(in_features=32, out_features=128815, bias=False)
)Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API

Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.