Voxtral-Small-24B-2507
8.4K
440
24.0B
8 languages
license:apache-2.0
by
mistralai
Audio Model
OTHER
24B params
New
8K downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
54GB+ RAM
Mobile
Laptop
Server
Quick Summary
Voxtral Small is an enhancement of Mistral Small 3, incorporating state-of-the-art audio input capabilities while retaining best-in-class text performance.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
23GB+ RAM
Code Examples
vLLM (recommended)textvllm
uv pip install -U "vllm[audio]" --systemtext
python -c "import mistral_common; print(mistral_common.__version__)"Offlinebashvllm
git clone https://github.com/vllm-project/vllm && cd vllmOfflinebash
python examples/offline_inference/audio_language.py --num-audios 2 --model-type voxtralAudio Instructbash
pip install --upgrade mistral_common\[audio\]Modify OpenAI's API key and API base to use vLLM's API server.pythonvllm
from mistral_common.protocol.instruct.messages import TextChunk, AudioChunk, UserMessage, AssistantMessage, RawAudio
from mistral_common.audio import Audio
from huggingface_hub import hf_hub_download
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://<your-server-host>:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
obama_file = hf_hub_download("patrickvonplaten/audio_samples", "obama.mp3", repo_type="dataset")
bcn_file = hf_hub_download("patrickvonplaten/audio_samples", "bcn_weather.mp3", repo_type="dataset")
def file_to_chunk(file: str) -> AudioChunk:
audio = Audio.from_file(file, strict=False)
return AudioChunk.from_audio(audio)
text_chunk = TextChunk(text="Which speaker is more inspiring? Why? How are they different from each other? Answer in French.")
user_msg = UserMessage(content=[file_to_chunk(obama_file), file_to_chunk(bcn_file), text_chunk]).to_openai()
print(30 * "=" + "USER 1" + 30 * "=")
print(text_chunk.text)
print("\n\n")
response = client.chat.completions.create(
model=model,
messages=[user_msg],
temperature=0.2,
top_p=0.95,
)
content = response.choices[0].message.content
print(30 * "=" + "BOT 1" + 30 * "=")
print(content)
print("\n\n")
# The model could give the following answer:
#Modify OpenAI's API key and API base to use vLLM's API server.pythonvllm
from mistral_common.protocol.transcription.request import TranscriptionRequest
from mistral_common.protocol.instruct.messages import RawAudio
from mistral_common.audio import Audio
from huggingface_hub import hf_hub_download
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://<your-server-host>:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
obama_file = hf_hub_download("patrickvonplaten/audio_samples", "obama.mp3", repo_type="dataset")
audio = Audio.from_file(obama_file, strict=False)
audio = RawAudio.from_audio(audio)
req = TranscriptionRequest(model=model, audio=audio, language="en", temperature=0.0).to_openai(exclude=("top_p", "seed"))
response = client.audio.transcriptions.create(**req)
print(response)Modify OpenAI's API key and API base to use vLLM's API server.pythonvllm
from mistral_common.protocol.instruct.messages import AudioChunk, UserMessage, TextChunk
from mistral_common.protocol.transcription.request import TranscriptionRequest
from mistral_common.protocol.instruct.tool_calls import Function, Tool
from mistral_common.audio import Audio
from huggingface_hub import hf_hub_download
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://<your-server-host>:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
tool = Tool(
function=Function(
name="get_current_weather",
description="Get the current weather",
parameters={
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"format": {
"type": "string",
"enum": ["celsius", "fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location.",
},
},
"required": ["location", "format"],
},
)
)
tools = [tool.to_openai()]
weather_like = hf_hub_download("patrickvonplaten/audio_samples", "fn_calling.wav", repo_type="dataset")
def file_to_chunk(file: str) -> AudioChunk:
audio = Audio.from_file(file, strict=False)
return AudioChunk.from_audio(audio)
audio_chunk = file_to_chunk(weather_like)
print(30 * "=" + "Transcription" + 30 * "=")
req = TranscriptionRequest(model=model, audio=audio_chunk.input_audio, language="en", temperature=0.0).to_openai(exclude=("top_p", "seed"))
response = client.audio.transcriptions.create(**req)
print(response.text) # How is the weather in Madrid at the moment?
print("\n")
print(30 * "=" + "Function calling" + 30 * "=")
audio_chunk = file_to_chunk(weather_like)
user_msg = UserMessage(content=[audio_chunk]).to_openai()
response = client.chat.completions.create(
model=model,
messages=[user_msg],
temperature=0.2,
top_p=0.95,
tools=[tool.to_openai()]
)
print(30 * "=" + "BOT 1" + 30 * "=")
print(response.choices[0].message.tool_calls)
print("\n\n")Transformers đ€bash
pip install -U transformersbash
pip install --upgrade "mistral-common[audio]"Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free APIReplicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy NowDisclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.