SAILViT-Large-300M-448px
29
2
3 languages
license:apache-2.0
by
BytedanceDouyinContent
Image Model
OTHER
arXiv:2507.01643 · 0.3B params
New
29 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
4GB+ RAM
Mobile
Laptop
Server
Quick Summary
Vision Transformer (ViT) image encoder (448×448 input) that converts an image into visual token embeddings for use in multimodal pipelines.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
4GB+ RAM
Code Examples
Usage (Python · transformers)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize input pixels.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Return a torchvision pipeline mapping a PIL image to a normalized
    float tensor of shape (3, input_size, input_size)."""
    transform = T.Compose([
        # Force 3-channel RGB so grayscale/RGBA inputs don't break Normalize.
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])
    return transform


path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()

url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # checkpoint expects 448x448 inputs (per model name)

# Fetch the example image; fail fast on network/HTTP errors instead of
# handing a broken stream to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw).convert('RGB')

transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()

# Inference only: skip autograd graph construction.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
# NOTE(review): this span of the original document contained 29 verbatim
# copies of the identical usage snippet, each fused with a stray
# "Usagepythontransformers" web-scrape artifact at the join. Collapsed the
# repeated copies in this span to a single clean one.
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Standard ImageNet channel statistics used to normalize input pixels.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    """Build the preprocessing pipeline: RGB convert -> bicubic resize ->
    tensor -> ImageNet normalization."""
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet normalization constants for the SAIL-ViT preprocessing pipeline.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Return a torchvision transform: RGB-convert, bicubic resize to
    (input_size, input_size), ToTensor, then ImageNet normalization."""
    return T.Compose([
        # Ensure 3-channel RGB input (handles grayscale / RGBA images).
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ])

# Load the SAIL-ViT vision tower in bfloat16; trust_remote_code is required
# because the model ships custom modeling code on the Hub.
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only -> disable autograd.  (Stray scraped page text fused onto the
# end of this line made the original a syntax error; it has been removed.)
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
visual_tokens = model(pixel_values=pixel_values)Usagepythontransformers
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet channel statistics used to normalize inputs for the ViT encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for a square ``input_size`` crop.

    Converts non-RGB images to RGB, resizes with bicubic interpolation,
    converts to a float tensor, and normalizes with ImageNet mean/std.
    """
    mean, std = IMAGENET_MEAN, IMAGENET_STD
    steps = [
        T.Lambda(lambda im: im if im.mode == 'RGB' else im.convert('RGB')),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    return T.Compose(steps)
# --- Example: extract visual tokens from an image with SAILViT ---
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: the checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # native resolution of the 448px checkpoint
# Stream the demo image; build_transform also forces RGB, so this convert is redundant but harmless.
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's bfloat16 weights on GPU.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet channel statistics used to normalize inputs for the ViT encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for a square ``input_size`` crop.

    Converts non-RGB images to RGB, resizes with bicubic interpolation,
    converts to a float tensor, and normalizes with ImageNet mean/std.
    """
    mean, std = IMAGENET_MEAN, IMAGENET_STD
    steps = [
        T.Lambda(lambda im: im if im.mode == 'RGB' else im.convert('RGB')),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    return T.Compose(steps)
# --- Example: extract visual tokens from an image with SAILViT ---
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: the checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # native resolution of the 448px checkpoint
# Stream the demo image; build_transform also forces RGB, so this convert is redundant but harmless.
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's bfloat16 weights on GPU.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet channel statistics used to normalize inputs for the ViT encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for a square ``input_size`` crop.

    Converts non-RGB images to RGB, resizes with bicubic interpolation,
    converts to a float tensor, and normalizes with ImageNet mean/std.
    """
    mean, std = IMAGENET_MEAN, IMAGENET_STD
    steps = [
        T.Lambda(lambda im: im if im.mode == 'RGB' else im.convert('RGB')),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    return T.Compose(steps)
# --- Example: extract visual tokens from an image with SAILViT ---
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: the checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # native resolution of the 448px checkpoint
# Stream the demo image; build_transform also forces RGB, so this convert is redundant but harmless.
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's bfloat16 weights on GPU.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
import torch
import torchvision.transforms as T
from PIL import Image
import requests
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
# ImageNet channel statistics used to normalize inputs for the ViT encoder.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the preprocessing pipeline for a square ``input_size`` crop.

    Converts non-RGB images to RGB, resizes with bicubic interpolation,
    converts to a float tensor, and normalizes with ImageNet mean/std.
    """
    mean, std = IMAGENET_MEAN, IMAGENET_STD
    steps = [
        T.Lambda(lambda im: im if im.mode == 'RGB' else im.convert('RGB')),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=mean, std=std),
    ]
    return T.Compose(steps)
# --- Example: extract visual tokens from an image with SAILViT ---
path = "BytedanceDouyinContent/SAILViT-Large-300M-448px"
# trust_remote_code is required: the checkpoint ships custom modeling code.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True).eval().cuda()
url = "https://img-blog.csdnimg.cn/fcc22710385e4edabccf2451d5f64a99.jpeg"
input_size = 448  # native resolution of the 448px checkpoint
# Stream the demo image; build_transform also forces RGB, so this convert is redundant but harmless.
image = Image.open(requests.get(url, stream=True).raw).convert('RGB')
transform = build_transform(input_size=input_size)
# Add a batch dimension and match the model's bfloat16 weights on GPU.
pixel_values = transform(image).unsqueeze(0).to(torch.bfloat16).cuda()
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    visual_tokens = model(pixel_values=pixel_values)
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.