Qwen3-VL-Embedding-2B
by Qwen
Embedding Model · 2B params · license: apache-2.0
1.6M downloads · 379 · Battle-tested
Edge AI: Mobile · Laptop · Server (5GB+ RAM)
Quick Summary
Qwen3-VL-Embedding-2B is a 2B-parameter multimodal (vision-language) embedding model from the Qwen team. It encodes text, images, and combined text-plus-image inputs into a shared 2048-dimensional embedding space, making it suitable for cross-modal retrieval, semantic search, and similarity scoring.
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 2GB+ RAM
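As a rough sanity check on these figures, the sketch below estimates how much memory the 2B parameters alone occupy at common precisions. Actual usage is higher once activations, image preprocessing buffers, and framework overhead are added; the bytes-per-parameter multipliers are general rules of thumb, not measurements of this specific model.

# Back-of-the-envelope memory estimate for a 2B-parameter model (illustrative only).
# Bytes per parameter: fp32 = 4, fp16/bf16 = 2, int8 = 1, int4 = 0.5.
NUM_PARAMS = 2e9  # "2B params" from the model card

for dtype, bytes_per_param in [("fp32", 4), ("fp16/bf16", 2), ("int8", 1), ("int4", 0.5)]:
    weights_gb = NUM_PARAMS * bytes_per_param / 1024**3
    print(f"{dtype:>10}: ~{weights_gb:.1f} GB for weights alone")
# fp16/bf16 weights alone land around ~3.7 GB, which is roughly why 4-6GB RAM
# devices are listed as the practical floor for on-device use.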
Code Examples
Usage (Python)
from sentence_transformers import SentenceTransformer
# Load the model
model = SentenceTransformer("Qwen/Qwen3-VL-Embedding-2B")
# Text queries
queries = [
"A woman playing with her dog on a beach at sunset.",
"Pet owner training dog outdoors near water.",
"Woman surfing on waves during a sunny day.",
"City skyline view from a high-rise building at night.",
]
# Documents: text, image, and text+image
documents = [
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.",
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
]
# Encode queries and documents
query_embeddings = model.encode(queries)
doc_embeddings = model.encode(documents)
print(query_embeddings.shape, doc_embeddings.shape)
# (4, 2048) (3, 2048)
# Compute similarities
similarities = model.similarity(query_embeddings, doc_embeddings)
print(similarities)
# tensor([[0.8160, 0.7155, 0.7054],
#         [0.5173, 0.3295, 0.4446],
#         [0.3863, 0.2987, 0.3312],
#         [0.1061, 0.0433, 0.0839]])
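The similarity matrix above already contains everything needed for a basic retrieval step. The short continuation below (not part of the original example; it reuses the `similarities` and `queries` variables defined above) ranks the candidate documents for each query and prints the best match:

# Rank the documents for each query using the similarity matrix computed above.
import torch

ranking = torch.argsort(similarities, dim=1, descending=True)
for q_idx, query in enumerate(queries):
    best = ranking[q_idx, 0].item()
    score = similarities[q_idx, best].item()
    print(f"Query: {query}")
    print(f"  best document: index {best} (score {score:.4f})")

With the sample data, document 0 scores highest for the first query, consistent with the 0.8160 entry in the tensor above.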
Using transformers (Python, PyTorch)
# Qwen3VLEmbedder is a helper wrapper around the transformers model; the
# scripts.qwen3_vl_embedding module is expected to be available locally
# (e.g., copied from the model repository), not installed via pip.
from scripts.qwen3_vl_embedding import Qwen3VLEmbedder
import numpy as np
import torch
# Define a list of query texts
queries = [
{"text": "A woman playing with her dog on a beach at sunset."},
{"text": "Pet owner training dog outdoors near water."},
{"text": "Woman surfing on waves during a sunny day."},
{"text": "City skyline view from a high-rise building at night."}
]
# Define a list of document texts and images
documents = [
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."},
{"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}
]
# Specify the model path
model_name_or_path = "Qwen/Qwen3-VL-Embedding-2B"
# Initialize the Qwen3VLEmbedder model
model = Qwen3VLEmbedder(model_name_or_path=model_name_or_path)
# We recommend enabling flash_attention_2 for better acceleration and memory savings:
# model = Qwen3VLEmbedder(model_name_or_path=model_name_or_path, torch_dtype=torch.float16, attn_implementation="flash_attention_2")
# Combine queries and documents into a single input list
inputs = queries + documents
# Process the inputs to get embeddings
embeddings = model.process(inputs)
# Compute similarity scores between query embeddings and document embeddings
similarity_scores = (embeddings[:4] @ embeddings[4:].T)
# Print out the similarity scores in a list format
print(similarity_scores.tolist())
# [[0.8157786130905151, 0.7178360223770142, 0.7173429131507874],
#  [0.5195091962814331, 0.3302568793296814, 0.4391537308692932],
#  [0.3884059488773346, 0.285782128572464, 0.33141762018203735],
#  [0.1092604324221611, 0.03871120512485504, 0.06952016055583954]]
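For more than a handful of documents you would typically keep the document embeddings around and query them repeatedly. The sketch below shows a minimal, dependency-free way to do that with NumPy over the `embeddings` array computed above (not part of the original example); the explicit L2 normalization makes the dot product a cosine similarity whether or not the model already returns normalized vectors.

# Minimal in-memory retrieval over the embeddings computed above (illustrative sketch).
# If `embeddings` is a GPU torch tensor, move it to the host first, e.g. embeddings.cpu().numpy().
import numpy as np

emb = np.asarray(embeddings)
query_embeddings, doc_embeddings = emb[:4], emb[4:]

def normalize(x):
    return x / np.linalg.norm(x, axis=-1, keepdims=True)

def top_k(query_vec, doc_matrix, k=2):
    # Cosine similarity between one query and every document.
    scores = normalize(doc_matrix) @ normalize(query_vec)
    order = np.argsort(-scores)[:k]
    return [(int(i), float(scores[i])) for i in order]

print(top_k(query_embeddings[0], doc_embeddings))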
vLLM Basic Usage Example (Python, vLLM)
import argparse
import numpy as np
import os
from typing import List, Dict, Any
from vllm import LLM, EngineArgs
from vllm.multimodal.utils import fetch_image
# Define a list of query texts
queries = [
{"text": "A woman playing with her dog on a beach at sunset."},
{"text": "Pet owner training dog outdoors near water."},
{"text": "Woman surfing on waves during a sunny day."},
{"text": "City skyline view from a high-rise building at night."}
]
# Define a list of document texts and images
documents = [
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."},
{"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}
]
def format_input_to_conversation(input_dict: Dict[str, Any], instruction: str = "Represent the user's input.") -> List[Dict]:
    content = []
    text = input_dict.get('text')
    image = input_dict.get('image')
    if image:
        image_content = None
        if isinstance(image, str):
            if image.startswith(('http', 'https', 'oss')):
                image_content = image
            else:
                abs_image_path = os.path.abspath(image)
                image_content = 'file://' + abs_image_path
        else:
            image_content = image
        if image_content:
            content.append({
                'type': 'image',
                'image': image_content,
            })
    if text:
        content.append({'type': 'text', 'text': text})
    if not content:
        content.append({'type': 'text', 'text': ""})
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": instruction}]},
        {"role": "user", "content": content}
    ]
    return conversation
def prepare_vllm_inputs(input_dict: Dict[str, Any], llm, instruction: str = "Represent the user's input.") -> Dict[str, Any]:
    text = input_dict.get('text')
    image = input_dict.get('image')
    conversation = format_input_to_conversation(input_dict, instruction)
    prompt_text = llm.llm_engine.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )
    multi_modal_data = None
    if image:
        if isinstance(image, str):
            if image.startswith(('http', 'https', 'oss')):
                try:
                    image_obj = fetch_image(image)
                    multi_modal_data = {"image": image_obj}
                except Exception as e:
                    print(f"Warning: Failed to fetch image {image}: {e}")
            else:
                abs_image_path = os.path.abspath(image)
                if os.path.exists(abs_image_path):
                    from PIL import Image
                    image_obj = Image.open(abs_image_path)
                    multi_modal_data = {"image": image_obj}
                else:
                    print(f"Warning: Image file not found: {abs_image_path}")
        else:
            multi_modal_data = {"image": image}
    result = {
        "prompt": prompt_text,
        "multi_modal_data": multi_modal_data
    }
    return result
def main():
    parser = argparse.ArgumentParser(description="Offline Similarity Check with vLLM")
    parser.add_argument("--model-path", type=str, default="models/Qwen3-VL-Embedding-2B", help="Path to the model")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type (e.g., bfloat16)")
    args = parser.parse_args()
    print(f"Loading model from {args.model_path}...")
    engine_args = EngineArgs(
        model=args.model_path,
        runner="pooling",
        dtype=args.dtype,
        trust_remote_code=True,
    )
    llm = LLM(**vars(engine_args))
    all_inputs = queries + documents
    vllm_inputs = [prepare_vllm_inputs(inp, llm) for inp in all_inputs]
    outputs = llm.embed(vllm_inputs)
    embeddings_list = []
    for i, output in enumerate(outputs):
        emb = output.outputs.embedding
        embeddings_list.append(emb)
        print(f"Input {i} embedding shape: {len(emb)}")
    embeddings = np.array(embeddings_list)
    print(f"\nEmbeddings shape: {embeddings.shape}")
    num_queries = len(queries)
    query_embeddings = embeddings[:num_queries]
    doc_embeddings = embeddings[num_queries:]
    similarity_scores = query_embeddings @ doc_embeddings.T
    print("\nSimilarity Scores:")
    print(similarity_scores.tolist())

if __name__ == "__main__":
    main()
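The script above runs vLLM in-process. vLLM can also expose embedding models through its OpenAI-compatible HTTP server, but the exact launch flags for pooling/embedding serving differ across vLLM versions, so treat the snippet below as an assumed setup rather than a verified recipe: it sends a plain text-only request to a locally running /v1/embeddings endpoint, and the URL, port, and served model name are placeholders.

# Hypothetical client for a locally running OpenAI-compatible embeddings endpoint.
# Assumes the model was launched for embedding/pooling serving (e.g., via `vllm serve`);
# consult the vLLM documentation for the exact flags for your version.
import requests

response = requests.post(
    "http://localhost:8000/v1/embeddings",  # placeholder URL/port
    json={
        "model": "Qwen/Qwen3-VL-Embedding-2B",  # must match the served model name
        "input": [
            "A woman playing with her dog on a beach at sunset.",
            "City skyline view from a high-rise building at night.",
        ],
    },
    timeout=60,
)
response.raise_for_status()
vectors = [item["embedding"] for item in response.json()["data"]]
print(len(vectors), len(vectors[0]))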
SGLang Basic Usage Example (Python, PyTorch)
import argparse
import numpy as np
import torch
import os
from typing import List, Dict, Any
from sglang.srt.entrypoints.engine import Engine
# Define a list of query texts
queries = [
{"text": "A woman playing with her dog on a beach at sunset."},
{"text": "Pet owner training dog outdoors near water."},
{"text": "Woman surfing on waves during a sunny day."},
{"text": "City skyline view from a high-rise building at night."}
]
# Define a list of document texts and images
documents = [
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust."},
{"image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
{"text": "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, as the dog offers its paw in a heartwarming display of companionship and trust.", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"}
]
def format_input_to_conversation(input_dict: Dict[str, Any], instruction: str = "Represent the user's input.") -> List[Dict]:
    content = []
    text = input_dict.get('text')
    image = input_dict.get('image')
    if image:
        image_content = None
        if isinstance(image, str):
            if image.startswith(('http', 'oss')):
                image_content = image
            else:
                abs_image_path = os.path.abspath(image)
                image_content = 'file://' + abs_image_path
        else:
            image_content = image
        if image_content:
            content.append({
                'type': 'image', 'image': image_content,
            })
    if text:
        content.append({'type': 'text', 'text': text})
    if not content:
        content.append({'type': 'text', 'text': ""})
    conversation = [
        {"role": "system", "content": [{"type": "text", "text": instruction}]},
        {"role": "user", "content": content}
    ]
    return conversation
def convert_to_sglang_format(input_dict: Dict[str, Any], engine: Engine, instruction: str = "Represent the user's input.") -> Dict[str, Any]:
    conversation = format_input_to_conversation(input_dict, instruction)
    text_for_api = engine.tokenizer_manager.tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )
    result = {"text": text_for_api}
    image = input_dict.get('image')
    if image and isinstance(image, str):
        result["image"] = image
    return result
def main():
    parser = argparse.ArgumentParser(description="Offline Similarity Check with SGLang")
    parser.add_argument("--model-path", type=str, default="models/Qwen3-VL-Embedding-2B", help="Path to the model")
    parser.add_argument("--dtype", type=str, default="bfloat16", help="Data type (e.g., bfloat16)")
    args = parser.parse_args()
    print(f"Loading model from {args.model_path}...")
    engine = Engine(
        model_path=args.model_path,
        is_embedding=True,
        dtype=args.dtype,
        trust_remote_code=True,
    )
    inputs = queries + documents
    sglang_inputs = [convert_to_sglang_format(inp, engine) for inp in inputs]
    print(f"sglang_inputs: {sglang_inputs}")
    print(f"Processing {len(sglang_inputs)} inputs...")
    prompts = [inp['text'] for inp in sglang_inputs]
    images = [inp.get('image') for inp in sglang_inputs]
    results = engine.encode(prompts, image_data=images)
    embeddings_list = []
    for res in results:
        embeddings_list.append(res['embedding'])
    embeddings = np.array(embeddings_list)
    print(f"Embeddings shape: {embeddings.shape}")
    num_queries = len(queries)
    query_embeddings = embeddings[:num_queries]
    doc_embeddings = embeddings[num_queries:]
    similarity_scores = (query_embeddings @ doc_embeddings.T)
    print("\nSimilarity Scores:")
    print(similarity_scores.tolist())

if __name__ == "__main__":
    main()