MuFun-Base
by Yi3852 · license: apache-2.0 · Audio Model · 2 languages · 16 downloads
Paper: arXiv:2508.01178 (Advancing the Foundation Model for Music Understanding)
Quick Summary
The MuFun model was proposed in Advancing the Foundation Model for Music Understanding (arXiv:2508.01178). Audio processing packages such as mutagen and torchaudio need to be installed before use.
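As a quick sanity check, here is a minimal sketch (assuming a standard pip environment) that verifies the listed prerequisites are importable:

```python
# Minimal environment check (sketch). The model card lists transformers plus
# audio packages such as mutagen and torchaudio as prerequisites, e.g.
# installed via: pip install transformers mutagen torchaudio
import importlib.util

for pkg in ("transformers", "mutagen", "torchaudio"):
    found = importlib.util.find_spec(pkg) is not None
    print(f"{pkg}: {'OK' if found else 'missing'}")
```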
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU recommended
Code Examples
Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
hf_path = 'Yi3852/MuFun-Base'
tokenizer = AutoTokenizer.from_pretrained(hf_path, use_fast=False)
device='cuda'
model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True, torch_dtype="bfloat16")
model.to(device)
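# Note (assumption, not from the model card): bfloat16 needs hardware
# support (e.g. recent NVIDIA GPUs); on CPU, torch_dtype="float32" is a
# safer choice.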
# single audio
# during inference the audio (converted to a sequence of embeddings) is placed at the position of the <audio> tag in the prompt
aud="/path/to/your/song.mp3"
inp="\n<audio>Can you listen to this song and tell me its lyrics?"
res=model.chat(prompt=inp, audio_files=aud, tokenizer=tokenizer)
print(res)
# multiple audios
# for multiple songs, each will be placed in the corresponding <audio> tag in the prompt
aud=["/path/to/your/song1.mp3", '/path/to/your/song2.mp3']
inp="\n<audio> This is song1. <audio> This is song2. Which song do you like more? Tell me the reason."
res=model.chat(prompt=inp, audio_files=aud, tokenizer=tokenizer)
print(res)
# analyze only a specific segment of audio using the segs parameter
# format is [start_time, end_time] in seconds; for multiple audios, segs can be passed like [[0,30],[60,90]] or [None,[0,30.0]]
aud="/path/to/your/song.mp3"
inp="\n<audio>How is the rhythm of this music clip?"
res=model.chat(prompt=inp, audio_files=aud, segs=[0,30.0], tokenizer=tokenizer)
print(res)
# passing audio_files=None works, but using MuFun as a text-only model is not recommended
```
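The multi-audio prompting and the segs parameter shown above can be combined. A minimal sketch, assuming segs aligns positionally with audio_files as the examples suggest (file paths are placeholders):

```python
# Compare the opening 30 seconds of two songs (sketch; paths are placeholders).
auds = ["/path/to/your/song1.mp3", "/path/to/your/song2.mp3"]
inp = "\n<audio> This is song1. <audio> This is song2. Which intro has a faster tempo?"
res = model.chat(prompt=inp, audio_files=auds, segs=[[0, 30.0], [0, 30.0]], tokenizer=tokenizer)
print(res)
```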