SAILViT-Large-300M-448px

29
2
3 languages
license:apache-2.0
by
BytedanceDouyinContent
Image Model
OTHER
arXiv:2507.01643 · 300M params
New
29 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
4GB+ RAM
Mobile
Laptop
Server
Quick Summary

SAIL-ViT vision encoder (300M parameters, 448×448 input) that turns images into visual token features for multimodal models.

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
4GB+ RAM

Code Examples

Usage (Python, transformers)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet per-channel normalization statistics (RGB order),
# applied by the transform built in build_transform below.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Return the eval-time preprocessing pipeline.

    Steps: ensure RGB mode, bicubic-resize to a square of side
    ``input_size``, convert to a float tensor, then normalize with the
    ImageNet channel statistics.
    """
    def _ensure_rgb(img):
        # Grayscale / palette images must be converted before ToTensor.
        return img if img.mode == 'RGB' else img.convert('RGB')

    return T.Compose([
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Hugging Face repo id of the SAIL-ViT vision encoder (448px variant).
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required because the model class is shipped inside
# the repo itself; weights are loaded in bfloat16, eval mode, on CUDA.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

# Fetch a sample image over HTTP and stream-decode it with PIL.
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device before forward.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Forward pass yields the encoder's visual token features.
# NOTE(review): exact output type/shape is defined by the remote model
# code pulled via trust_remote_code — confirm against the repo.
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)  # ImageNet per-channel mean (RGB)
IMAGENET_STD = (0.229, 0.224, 0.225)  # ImageNet per-channel std (RGB)
def build_transform(input_size):
    """Build the preprocessing pipeline: RGB conversion, bicubic resize to a
    square of side ``input_size``, tensor conversion, ImageNet normalization."""
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Grayscale/palette images are converted to RGB before normalization.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

# Load the SAIL-ViT vision tower in bf16 on GPU and run one image through it.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

# NOTE(review): requests.get has no timeout and the forward pass runs without
# torch.no_grad(); acceptable for a demo snippet, but both are worth fixing.
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # native resolution of the 448px checkpoint
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)  # forward pass -> visual tokens
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

# NOTE(review): the original file repeated this identical usage example ~30
# times, separated by stray "Usagepythontransformers" lines scraped from the
# model-card page tabs.  Each stray line is a bare name that raises NameError
# on import, so only the code before the first one could ever run.  Collapsed
# to a single clean copy; every module-level name the script defines
# (build_transform, path, model, url, input_size, image, transform,
# pixel_values, visual_tokens) is unchanged.
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize RGB inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Return a torchvision transform mapping a PIL image to a normalized
    float tensor of shape (3, input_size, input_size).

    Args:
        input_size: Target square side length in pixels (448 for this
            checkpoint).

    Returns:
        A ``torchvision.transforms.Compose`` pipeline.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Force 3-channel RGB so grayscale/RGBA inputs normalize correctly.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code=True is required because the ViT implementation ships
# alongside the checkpoint; bfloat16 matches the published weights.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # native resolution of the 448px checkpoint
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device before inference.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Usage (Python, transformers):
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize model inputs.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for the vision encoder.

    Converts the image to RGB if needed, resizes it to a square of
    ``input_size`` pixels with bicubic interpolation, converts it to a
    tensor, and applies ImageNet mean/std normalization.
    """
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        # Guard against palette/grayscale inputs that are not RGB yet.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: this checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # matches the 448px resolution in the checkpoint name
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's dtype/device.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Fix: run the forward pass without building an autograd graph —
# this is pure inference, so gradient tracking only wastes memory.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.