MERaLiON-SER-v1

814
2
7 languages
by
MERaLiON
Embedding Model
OTHER
New
814 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary

Speech emotion recognition (SER) model: given an audio clip, it predicts one of seven emotion categories (Neutral, Happy, Sad, Angry, Fearful, Disgusted, Surprised) along with valence/arousal/dominance scores.

Code Examples

⚙️ Usage Examples (Python, `transformers`)
# Speech emotion recognition with MERaLiON-SER-v1: single-file and batched
# inference. Prints a 7-class emotion label and valence/arousal/dominance scores.
from transformers import AutoProcessor, AutoModelForAudioClassification
import torch, torchaudio

repo = "MERaLiON/MERaLiON-SER-v1"
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model ships custom code in the repo, hence trust_remote_code=True.
processor = AutoProcessor.from_pretrained(repo)
model = AutoModelForAudioClassification.from_pretrained(repo, trust_remote_code=True).to(device)
model.eval()

# ------- Single wav example --------------
wav, sr = torchaudio.load("sample.wav")
if wav.shape[0] > 1:  # downmix multi-channel audio to mono
    wav = wav.mean(dim=0, keepdim=True)
wav = torchaudio.transforms.Resample(sr, 16000)(wav)  # model expects 16 kHz input

inputs = processor(wav.squeeze().numpy(), sampling_rate=16000, return_tensors="pt", return_attention_mask=True)
with torch.inference_mode():
    # Forward only the keys the model consumes, moved to the target device.
    out = model(**{k: v.to(device) for k, v in inputs.items() if k in ("input_features", "attention_mask")})
logits, dims = out["logits"], out["dims"]

emo_idx = torch.argmax(logits, dim=1).item()
emo_map = ["Neutral", "Happy", "Sad", "Angry", "Fearful", "Disgusted", "Surprised"]
print("Predicted Emotion:", emo_map[emo_idx])
print("Valence/Arousal/Dominance:", dims.squeeze().tolist())

# -------- Batch inference example using above loaded wav file --------
wav = wav.squeeze().numpy()  # 1-D array of shape (samples,)
wavs = [wav, wav, wav]  # example list of wavs for a batch; here batch size = 3

batch_inputs = processor(
    wavs,                     # list of 1-D numpy arrays
    sampling_rate=16000,
    return_tensors="pt",
    padding="max_length",
    return_attention_mask=True,
)
with torch.inference_mode():
    out = model(**{k: v.to(device) for k, v in batch_inputs.items() if k in ("input_features", "attention_mask")})

logits, dims = out["logits"], out["dims"]  # logits: (B, 7), dims: (B, 3) where B is batch size
emo_ids = torch.argmax(logits, dim=1).tolist()

# Index one row per iteration instead of converting the whole dims tensor
# to a Python list on every pass through the loop.
for i, eid in enumerate(emo_ids):
    vad = dims[i].tolist()
    print(f"Batch Index {i} -> {emo_map[eid]} | VAD={vad}")
🔹 CPU Inference (Python, `transformers`)
# CPU-only speech emotion recognition with MERaLiON-SER-v1.
# Reads a wav with soundfile, resamples to 16 kHz if needed, and prints
# the predicted emotion plus valence/arousal/dominance scores.
from transformers import AutoProcessor, AutoModelForAudioClassification
import torch, soundfile as sf, torchaudio

repo = "MERaLiON/MERaLiON-SER-v1"
processor = AutoProcessor.from_pretrained(repo)
# Custom model code lives in the repo, hence trust_remote_code=True.
model = AutoModelForAudioClassification.from_pretrained(repo, trust_remote_code=True).cpu().eval()

wav, sr = sf.read("sample.wav")  # (samples,) or (samples, channels)
if wav.ndim > 1:  # downmix multi-channel audio to mono
    wav = wav.mean(axis=1)
if sr != 16000:  # model expects 16 kHz input
    wav = torchaudio.functional.resample(torch.tensor(wav).unsqueeze(0), sr, 16000).squeeze(0).numpy()

inputs = processor(wav, sampling_rate=16000, return_tensors="pt")
with torch.inference_mode():
    out = model(**inputs)
logits, dims = out["logits"], out["dims"]
emo_idx = torch.argmax(logits, dim=1).item()
emo_map = ["Neutral", "Happy", "Sad", "Angry", "Fearful", "Disgusted", "Surprised"]
print("Predicted Emotion:", emo_map[emo_idx])
print("Valence/Arousal/Dominance:", dims.squeeze().tolist())

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.