colqwen-omni-v0.1

by vidore · license: MIT · 1 language · ~5K downloads

Quick Summary

ColQwen2.5-Omni: a visual and audio retriever based on Qwen2.5-Omni-3B-Instruct, trained with the ColBERT late-interaction strategy. Check out the release blogpost for in-depth explanations and tutorials.
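
In ColBERT-style late interaction, a query and a document are each embedded as a matrix of token-level vectors, and relevance is the sum over query tokens of their best match among document tokens (MaxSim). A minimal sketch of that scorer, for illustration only; the batched `processor.score_multi_vector` used in the examples below implements the same idea:

```python
import torch

def maxsim_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    # query_emb: (num_query_tokens, dim); doc_emb: (num_doc_tokens, dim)
    sim = query_emb @ doc_emb.T          # token-to-token similarities
    return sim.max(dim=1).values.sum()   # best doc token per query token, summed
```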

Code Examples

Usage

Install the colpali engine from source:

```bash
pip install git+https://github.com/illuin-tech/colpali
```

Usage (transformers)

```python
import torch
from datasets import load_dataset
from IPython.display import Audio, display
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers.utils.import_utils import is_flash_attn_2_available

from colpali_engine.models import ColQwen2_5Omni, ColQwen2_5OmniProcessor

model = ColQwen2_5Omni.from_pretrained(
    "vidore/colqwen-omni-v0.1",
    torch_dtype=torch.bfloat16,
    device_map="cuda",  # or "mps" if on Apple Silicon
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else None,
).eval()
processor = ColQwen2_5OmniProcessor.from_pretrained("vidore/colqwen-omni-v0.1")

dataset = load_dataset("eustlb/dailytalk-conversations-grouped", split="train[:500]")
audios = [x["array"] for x in dataset["audio"]]


dataloader = DataLoader(
    dataset=audios,
    batch_size=2,
    shuffle=False,
    collate_fn=lambda x: processor.process_audios(x),
)

ds = []  # per-document multi-vector embeddings, kept on CPU
for batch_doc in tqdm(dataloader):
    with torch.no_grad():
        batch_doc = {k: v.to(model.device) for k, v in batch_doc.items()}
        embeddings_doc = model(**batch_doc)
    ds.extend(list(torch.unbind(embeddings_doc.to("cpu"))))

def get_results(query: str, k=10):
    batch_queries = processor.process_queries([query]).to(model.device)

    # Forward pass
    with torch.no_grad():
        query_embeddings = model(**batch_queries)

    scores = processor.score_multi_vector(query_embeddings, ds)
    # indices of the k best-scoring documents
    return scores[0].topk(k).indices.tolist()

res = get_results("A person looking for a taxi")

# In a notebook (e.g. Colab), play back the top-ranked clip
display(Audio(dataset[res[0]]["audio"]["array"], autoplay=True, rate=dataset[res[0]]["audio"]["sampling_rate"]))
```
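
The same pipeline indexes visual documents, since the model is also an image retriever. A minimal sketch, assuming the processor exposes a `process_images` method analogous to `process_audios` (as other colpali-engine processors do); the images here are blank placeholders to keep the example self-contained:

```python
from PIL import Image

# Placeholder pages; substitute real document screenshots.
images = [Image.new("RGB", (448, 448), "white"), Image.new("RGB", (448, 448), "black")]

batch_images = processor.process_images(images)  # assumed API, mirroring process_audios
batch_images = {k: v.to(model.device) for k, v in batch_images.items()}
with torch.no_grad():
    image_embeddings = model(**batch_images)

batch_queries = processor.process_queries(["A page with a bar chart"]).to(model.device)
with torch.no_grad():
    query_embeddings = model(**batch_queries)

scores = processor.score_multi_vector(query_embeddings, list(torch.unbind(image_embeddings.to("cpu"))))
print(scores)  # one late-interaction score per (query, image) pair
```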
