colqwen-omni-v0.1
by vidore · license: MIT · 1 language · 5K downloads · early-stage
Quick Summary
ColQwen2.5-Omni is a visual and audio retriever based on Qwen2.5-Omni-3B-Instruct, using the ColBERT multi-vector strategy. Check out the release blog post for in-depth explanations and tutorials.
Code Examples
Usage

Install the latest colpali-engine from source:

```bash
pip install git+https://github.com/illuin-tech/colpali
```

Index a collection of audio documents and search them with text queries:
```python
import torch
from datasets import load_dataset
from IPython.display import Audio, display
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.utils.import_utils import is_flash_attn_2_available

from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda",  # or "mps" if on Apple Silicon
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
).eval()
processor = ColQwen2_5OmniProcessor.from_pretrained("vidore/colqwen-omni-v0.1")

# Load 500 audio conversations to index.
dataset = load_dataset("eustlb/dailytalk-conversations-grouped", split="train[:500]")
audios = [x["array"] for x in dataset["audio"]]

dataloader = DataLoader(
    dataset=audios,
    batch_size=2,
    shuffle=False,
    collate_fn=lambda x: processor.process_audios(x),
)

# Embed each audio document as a bag of token-level vectors.
ds = []
for batch_doc in tqdm(dataloader):
    with torch.no_grad():
        batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
        embeddings_doc = model(**batch_doc)
    ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))

def get_results(query: str, k: int = 10):
    batch_queries = processor.process_queries([query]).to(model.device)
    # Forward pass
    with torch.no_grad():
        query_embeddings = model(**batch_queries)
    scores = processor.score_multi_vector(query_embeddings, ds)
    # Return the indices of the top-k scoring documents.
    return scores[0].topk(k).indices.tolist()

res = get_results("A person looking for a taxi")

# In Colab / Jupyter, play back the best match.
display(Audio(dataset[res[0]]["audio"]["array"], autoplay=True, rate=dataset[res[0]]["audio"]["sampling_rate"]))
```
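Under the hood, `score_multi_vector` performs the ColBERT-style late interaction the summary refers to: every query token is matched against every document token, and the best match per query token is summed. A minimal sketch of that MaxSim scoring, assuming unpadded query and document embeddings (the processor handles batching and padding for you):

```python
import torch

def maxsim_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    """ColBERT-style late interaction between one query and one document.

    query_emb: (num_query_tokens, dim)
    doc_emb:   (num_doc_tokens, dim)
    """
    # Pairwise similarities between every query token and every document token.
    sim = query_emb @ doc_emb.T  # (num_query_tokens, num_doc_tokens)
    # MaxSim: keep the best-matching document token per query token, then sum.
    return sim.max(dim=1).values.sum()

# Toy usage with random vectors; real embeddings come from the model above.
q = torch.randn(12, 128)
d = torch.randn(300, 128)
print(maxsim_score(q, d))
```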
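Since ColQwen2.5-Omni is also a visual retriever, document images can be indexed the same way. A minimal sketch reusing the `model` and `processor` loaded above, assuming `processor.process_images` follows the same interface as `process_audios` (as in other colpali-engine processors); the file names are placeholders:

```python
import torch
from PIL import Image

# Hypothetical local page screenshots, for illustration only.
images = [Image.open("page_1.png"), Image.open("page_2.png")]

# Embed the images as multi-vector documents.
batch_images = processor.process_images(images)
with torch.no_grad():
    batch_images = {k: v.to(model.device) for k, v in batch_images.items()}
    image_embeddings = model(**batch_images)
image_ds = list(torch.unbind(image_embeddings.to("cpu")))

# Score a text query against the indexed pages.
batch_queries = processor.process_queries(["revenue table for Q3"]).to(model.device)
with torch.no_grad():
    query_embeddings = model(**batch_queries)
scores = processor.score_multi_vector(query_embeddings, image_ds)
print(scores)
```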