Ming UniVision 16B A3B
349
58
16.0B
2 languages
license:apache-2.0
by
inclusionAI
Language Model
OTHER
16B params
New
349 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
36GB+ RAM
Mobile
Laptop
Server
Quick Summary
Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer. 📑 Technical Report | 📖 Project Page | 🤗 Hugging Face | 🤖 ModelScope
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
15GB+ RAM
Code Examples
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Usage (Python)
from mingunivisioninfer import MingUniVisionInfer
model = MingUniVisionInfer("inclusionAI/Ming-UniVision-16B-A3B")
# single round generation
image_gen_prompt = "Please generate the corresponding image based on the description. A cute girl."
messages = [{
"role": "HUMAN",
"content": [{"type": "text", "text": image_gen_prompt},],
}]
output_text = model.generate(messages, max_new_tokens=512, output_image_prefix="a_cute_girl")
model.reset_inner_state()
# single round understanding
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Please describe the picture in detail."},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
# multi-round editing
messages = [{
"role": "HUMAN",
"content": [
{"type": "image", "image": "a_cute_girl.png"},
{"type": "text", "text": "Given the edit instruction: Change the color of her cloth to red, please identify the editing region"},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_0")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Change the color of her cloth to red."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_1")
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "Refine the image for better clarity."},
],
}]
output_text = model.generate(messages, max_new_tokens=512, for_edit=True, output_image_prefix="edit_round_2")
model.reset_inner_state()
# single round text-based conversation
messages = [{
"role": "HUMAN",
"content": [
{"type": "text", "text": "请详细介绍鹦鹉的习性。"},
],
}]
output_text = model.generate(messages, max_new_tokens=512)
print(output_text)
model.reset_inner_state()
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Reference (BibTeX)
@article{huang2025mingunivision,
title={Ming-UniVision: Joint Image Understanding and Generation with a Unified Continuous Tokenizer},
author={Huang, Ziyuan and Zheng, DanDan and Zou, Cheng and Liu, Rui and Wang, Xiaolong and Ji, Kaixiang and Chai, Weilong and Sun, Jianxin and Wang, Libin and Lv, Yongjie and Huang, Taozhi and Liu, Jiajia and Guo, Qingpei and Yang, Ming and Chen, Jingdong and Zhou, Jun},
journal={arXiv preprint arXiv:2510.06590},
year={2025}
}
Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.