Qwen2.5-VL-7B-Instruct-FP8-Dynamic
39.6K
4
7.0B
1 language
license:apache-2.0
by
RedHatAI
Image Model
OTHER
7B params
Fair
40K downloads
Community-tested
Edge AI:
Mobile
Laptop
Server
16GB+ RAM
Mobile
Laptop
Server
Quick Summary
Model Overview - Model Architecture: Qwen2.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
7GB+ RAM
Code Examples
Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Load model.pythontransformers
import requests
import torch
from PIL import Image
from transformers import AutoProcessor
from llmcompressor.transformers import oneshot
from llmcompressor.transformers.tracing import (
TraceableQwen2_5_VLForConditionalGeneration,
)
from llmcompressor.modifiers.quantization import QuantizationModifier
# Load model.
model_id = Qwen/Qwen2.5-VL-7B-Instruct
model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
model_id, device_map="auto", torch_dtype="auto"
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
# Recipe
recipe = [
QuantizationModifier(
targets="Linear",
scheme="FP8_DYNAMIC",
sequential_targets=["MistralDecoderLayer"],
ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
),
]
SAVE_DIR=f"{model_id.split('/')[1]}-FP8-Dynamic"
# Perform oneshot
oneshot(
model=model,
recipe=recipe,
trust_remote_code_model=True,
output_dir=SAVE_DIR
)Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Vision Taskstextvllm
vllm serve neuralmagic/pixtral-12b-quantized.w8a8 --tensor_parallel_size 1 --max_model_len 25000 --trust_remote_code --max_num_seqs 8 --gpu_memory_utilization 0.9 --dtype float16 --limit_mm_per_prompt image=7
python -m eval.run eval_vllm \
--model_name neuralmagic/pixtral-12b-quantized.w8a8 \
--url http://0.0.0.0:8000 \
--output_dir ~/tmp \
--eval_name <vision_task_name>Text-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirText-based Taskstextvllm
lm_eval \
--model vllm \
--model_args pretrained="<model_name>",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=<n>,gpu_memory_utilization=0.8,enable_chunked_prefill=True,trust_remote_code=True \
--tasks mmlu \
--num_fewshot 5 \
--batch_size auto \
--output_path output_dirDeploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free APIReplicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy NowDisclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.