MuFun-Base
by Yi3852 · license: apache-2.0 · Audio Model · 2 languages · 16 downloads
Paper: arXiv:2508.01178 (Advancing the Foundation Model for Music Understanding)
Quick Summary
The MuFun model was proposed in Advancing the Foundation Model for Music Understanding (arXiv:2508.01178). Audio processing packages such as mutagen and torchaudio need to be installed before use.
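As a quick sanity check, here is a minimal sketch (assuming a standard pip environment) that verifies the listed prerequisites are importable:

```python
# Minimal environment check (sketch). The model card lists transformers plus
# audio packages such as mutagen and torchaudio as prerequisites, e.g.
# installed via: pip install transformers mutagen torchaudio
import importlib.util

for pkg in ("transformers", "mutagen", "torchaudio"):
    found = importlib.util.find_spec(pkg) is not None
    print(f"{pkg}: {'OK' if found else 'missing'}")
```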
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU recommended
Code Examples
Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
hf_path = 'Yi3852/MuFun-Base'
tokenizer = AutoTokenizer.from_pretrained(hf_path, use_fast=False)
device='cuda'
model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True, torch_dtype="bfloat16")
model.to(device)
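# Note (assumption, not from the model card): bfloat16 needs hardware
# support (e.g. recent NVIDIA GPUs); on CPU, torch_dtype="float32" is a
# safer choice.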
# single audio
# during inference the audio (converted to a sequence of embeddings) is placed at the position of the <audio> tag in the prompt
aud="/path/to/your/song.mp3"
inp="\n<audio>Can you listen to this song and tell me its lyrics?"
res=model.chat(prompt=inp, audio_files=aud, tokenizer=tokenizer)
print(res)
# multiple audios
# for multiple songs, each will be placed in the corresponding <audio> tag in the prompt
aud=["/path/to/your/song1.mp3", '/path/to/your/song2.mp3']
inp="\n<audio> This is song1. <audio> This is song2. Which song do you like more? Tell me the reason."
res=model.chat(prompt=inp, audio_files=aud, tokenizer=tokenizer)
print(res)
# analyze only a specific segment of audio using the segs parameter
# format is [start_time, end_time] in seconds; for multiple audios, segs can be passed like [[0,30],[60,90]] or [None,[0,30.0]]
aud="/path/to/your/song.mp3"
inp="\n<audio>How is the rhythm of this music clip?"
res=model.chat(prompt=inp, audio_files=aud, segs=[0,30.0], tokenizer=tokenizer)
print(res)
# passing audio_files=None works, but using MuFun as a text-only model is not recommended
```
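The multi-audio prompting and the segs parameter shown above can be combined. A minimal sketch, assuming segs aligns positionally with audio_files as the examples suggest (file paths are placeholders):

```python
# Compare the opening 30 seconds of two songs (sketch; paths are placeholders).
auds = ["/path/to/your/song1.mp3", "/path/to/your/song2.mp3"]
inp = "\n<audio> This is song1. <audio> This is song2. Which intro has a faster tempo?"
res = model.chat(prompt=inp, audio_files=auds, segs=[[0, 30.0], [0, 30.0]], tokenizer=tokenizer)
print(res)
```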