LLM2CLIP-EVA02-B-16
License: apache-2.0
by microsoft
Vision-Language (CLIP) Model
64 downloads
Early-stage
Edge AI: Mobile | Laptop | Server
Quick Summary
LLM2CLIP-EVA02-B-16 is a CLIP-style image-text embedding model. It pairs an EVA02-CLIP-B-16 vision encoder with an LLM-based text encoder (a Llama-3-8B-Instruct model fine-tuned for LLM2CLIP and wrapped with LLM2Vec), enabling zero-shot image-caption matching and cross-modal retrieval.
Code Examples
Usage (Python, transformers)

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from PIL import Image
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec

# Load the EVA02-CLIP-B-16 vision tower and its preprocessing transform,
# then load the LLM2CLIP checkpoint weights.
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()

# Load the LLM text encoder and wrap it with LLM2Vec for embedding extraction.
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(llm_model_name, trust_remote_code=True)
llm_model = AutoModel.from_pretrained(
    llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct'  # Workaround for LLM2Vec
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)

# Encode one image and a set of candidate captions, then compute
# zero-shot label probabilities from the cosine similarities.
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text_features)

    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from transformers import AutoModel, AutoConfig, AutoTokenizer
from eva_clip import create_model_and_transforms
from llm2vec import LLM2Vec
from PIL import Image
import torch
model, _, preprocess_val = create_model_and_transforms('EVA02-CLIP-B-16', force_custom_clip=True)
ckpt = torch.load('LLM2CLIP-EVA02-B-16.pt')
model.load_state_dict(ckpt)
model = model.cuda().eval()
llm_model_name = 'microsoft/LLM2CLIP-Llama-3-8B-Instruct-CC-Finetuned'
config = AutoConfig.from_pretrained(
llm_model_name, trust_remote_code=True
)
llm_model = AutoModel.from_pretrained(llm_model_name, torch_dtype=torch.bfloat16, config=config, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(llm_model_name)
llm_model.config._name_or_path = 'meta-llama/Meta-Llama-3-8B-Instruct' # Workaround for LLM2VEC
l2v = LLM2Vec(llm_model, tokenizer, pooling_mode="mean", max_length=512, doc_max_length=512)
image_path = "CLIP.png"
captions = ["a diagram", "a dog", "a cat"]
image = preprocess_val(Image.open(image_path)).cuda().unsqueeze(dim=0)
text_features = l2v.encode(captions, convert_to_tensor=True).to('cuda')
with torch.no_grad(), torch.cuda.amp.autocast():
image_features = model.encode_image(image)
text_features = model.encode_text(text_features)
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print("Label probs:", text_probs)Usagepythontransformers
Deploy This Model
Production-ready deployment in minutes
Together.ai: instant API access to this model through a production-ready inference API. Start free and scale to millions of requests.
Replicate: one-click model deployment. Run the model in the cloud with a simple API, no DevOps required.