TowerVision-2B

94
2
2.0B
19 languages
license:cc-by-nc-sa-4.0
by
utter-project
Image Model
OTHER
2B params
New
94 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
5GB+ RAM
Mobile
Laptop
Server
Quick Summary

TowerVision-2B is a 2.0B-parameter multilingual vision-language model (LLaVA-NeXT architecture) from utter-project, supporting image understanding and description across 19 languages.

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
2GB+ RAM

Code Examples

Quick Start with Transformers (Python)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Return *query* rendered with the TowerVision chat template.

    Wraps the query in a single-turn user conversation containing the
    ``<image>`` placeholder, then applies the processor's chat template
    with a generation prompt appended. Relies on the module-level
    ``processor`` being created before the first call.
    """
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]

    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the demo image. Fail fast on network/HTTP errors instead of
# handing a broken stream (or an HTML error page) to PIL, and bound the
# request with a timeout so the script cannot hang indefinitely.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
# Ensure the raw stream is transparently decompressed (e.g. gzip),
# otherwise PIL may receive compressed bytes it cannot parse.
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs: tokenized text plus pixel values, on the model's device
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response: slice off the prompt tokens, keep only the generation
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the
    TowerVision chat template, with the generation prompt appended.

    Uses the module-level ``processor``, which must exist before the
    first call.
    """
    # One user turn; the <image> placeholder marks where pixels go.
    messages = [{"role": "user", "content": f"<image>\n{query}"}]
    return processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Encode text + image into model-ready tensors on the model's device
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

# Generate, then decode only the tokens produced after the prompt
gen_tokens = model.generate(**inputs, max_new_tokens=512)
print(
    processor.tokenizer.decode(
        gen_tokens[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    )
)
Quick Start with Transformers (Python)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Return *query* rendered with the TowerVision chat template.

    Builds a single-turn user conversation containing the ``<image>``
    placeholder, then applies the processor's chat template with a
    generation prompt appended. Relies on the module-level
    ``processor`` being created before the first call.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url — streamed download; PIL reads straight from the raw socket.
# NOTE(review): no timeout or HTTP status check here — see hardened variant.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs — tokenized text plus pixel values, on the model's device
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response — slice off the prompt tokens, keep only the generation
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query: str) -> str:
    """Wrap *query* in a single-turn user message and render it with the
    TowerVision chat template.

    Uses the module-level ``processor`` (defined below); the global is
    resolved at call time, so only call this after the processor is loaded.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template; keep it as text
    # (tokenize=False) and append the assistant generation prompt.
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype; device_map="auto" places
# the weights on the available accelerator(s) automatically.
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. A timeout prevents hanging on a dead
# connection, raise_for_status() fails fast instead of feeding an HTML
# error page to PIL, and decode_content=True ensures gzip/deflate-encoded
# bodies are decoded before PIL reads the raw stream.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True, timeout=30)
response.raise_for_status()
response.raw.decode_content = True
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the prompt and preprocess the image, then move all tensors to
# the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens)
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode only the newly generated tokens (slice off the prompt length)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python, `transformers` library)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (Python / transformers)
#
# NOTE(review): the original scrape contained 24 byte-identical copies of this
# example separated by non-Python heading lines; they are collapsed into this
# single copy. Executing it once produces the same end state as executing the
# redundant redefinitions repeatedly.
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration,
)
import requests
from PIL import Image

# Model checkpoint to load; swap for any other TowerVision variant.
model_id = "utter-project/TowerVision-2B"  # or any other variant


def prepare_prompt(query):
    """Wrap *query* in a single-turn user conversation and render it with the
    TowerVision chat template.

    Relies on the module-level ``processor`` being initialised before this
    function is called (it is, a few lines below, before the first call).
    """
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}",
        }
    ]

    # tokenize=False returns the formatted prompt string rather than token ids;
    # add_generation_prompt appends the assistant-turn header so the model
    # starts answering instead of continuing the user turn.
    prompt = processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

    return prompt


# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Download the example image. Fail fast on HTTP errors instead of handing a
# broken/HTML error body to PIL, which would raise a confusing UnidentifiedImageError.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
response = requests.get(img_url, stream=True)
response.raise_for_status()
image = Image.open(response.raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare the multimodal inputs and move the tensors onto the model's device.
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids (up to 512 new tokens).
gen_tokens = model.generate(**inputs, max_new_tokens=512)

# Decode response: slice off the prompt tokens (inputs.input_ids.shape[1])
# so only the newly generated answer is printed.
print(
    processor.tokenizer.decode(
        gen_tokens[0][inputs.input_ids.shape[1]:],
        skip_special_tokens=True,
    )
)
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
# Quick Start with Transformers (pip install transformers pillow requests)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from io import BytesIO
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a single-turn user message using the TowerVision
    chat template (relies on the module-level ``processor``)."""
    conversation = [
        {
            "role": "user",
            "content": f"<image>\n{query}"
        }
    ]
    # Format message with the towervision chat template
    return processor.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Fetch the example image with a timeout and an explicit status check;
# `stream=True).raw` skips both and bypasses content-encoding (gzip)
# decoding, which can hand PIL corrupt bytes.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
resp = requests.get(img_url, timeout=30)
resp.raise_for_status()
image = Image.open(BytesIO(resp.content))

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response (only the tokens generated after the prompt)
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Render *query* as a templated single-image chat turn."""
    # One user message carrying the image placeholder plus the question.
    messages = [{"role": "user", "content": f"<image>\n{query}"}]
    # Apply the towervision chat template and append the generation prompt.
    return processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

# bfloat16 is the recommended dtype for this checkpoint.
load_opts = dict(torch_dtype="bfloat16", device_map="auto")
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **load_opts)

# Fetch the demo image as a streamed HTTP response.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Tokenize the text, preprocess the image, and move tensors to the model device.
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

# Generate, then decode only the tokens produced after the prompt.
gen_tokens = model.generate(**inputs, max_new_tokens=512)
new_tokens = gen_tokens[0][inputs.input_ids.shape[1]:]
print(processor.tokenizer.decode(new_tokens, skip_special_tokens=True))
Quick Start with Transformers (python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Wrap *query* in a single-image user turn and render it with the
    model's chat template (generation prompt appended).

    Relies on the module-level ``processor`` defined later in this
    script being available at call time.
    """
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url — demo image fetched as a streamed HTTP response; PIL reads
# directly from the raw socket stream
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs (tokenized text + preprocessed image) on the model device
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response — slice off the echoed prompt tokens, keep only new ones
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
*(Extraction note: the page scraper duplicated the quick-start example above 24 additional times, the last copy truncated mid-snippet. The byte-identical duplicates have been collapsed; the single example above is the complete snippet.)*
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

# Hub id of the checkpoint to load.
model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    """Turn a user query into a TowerVision chat-template prompt string."""
    # Single-turn conversation: one user message carrying the image slot.
    message = {"role": "user", "content": f"<image>\n{query}"}
    # Render (but do not tokenize) with the model's chat template.
    return processor.apply_chat_template(
        [message], tokenize=False, add_generation_prompt=True
    )

# bfloat16 is the recommended dtype for this model.
load_options = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **load_options)

# Download the example image and open it with PIL.
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# TowerVision is multilingual (20+ languages); any language works here.
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Pack text and image into model-ready tensors on the model's device.
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

# Sample up to 512 new tokens, then print only the generated continuation.
gen_tokens = model.generate(**inputs, max_new_tokens=512)
continuation = gen_tokens[0][inputs.input_ids.shape[1]:]
print(processor.tokenizer.decode(continuation, skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformers (Python · transformers)
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Quick Start with Transformerspythontransformers
from transformers import (
    LlavaNextProcessor,
    LlavaNextForConditionalGeneration
)
import requests
from PIL import Image

model_id = "utter-project/TowerVision-2B"  # or any other variant

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    prompt = processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    return prompt

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# img url
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)

# Multilingual prompts - TowerVision supports 20+ languages!
prompt = prepare_prompt("Is this person really big, or is this building just super small?")

# Prepare inputs
inputs = processor(
    text=prompt, images=image, return_tensors="pt"
).to(model.device)

# Generate response ids
gen_tokens = model.generate(**inputs, max_new_tokens=512)
# Decode response
print(processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True))
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
# Number of examples to process per batch. (The original snippet referenced
# batch_size below without ever defining it, causing a NameError.)
batch_size = 2

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Each query is paired with exactly one image via the "<image>"
    placeholder expected by LLaVA-Next style processors.

    Args:
        queries: iterable of question strings.

    Returns:
        list[str]: chat-formatted prompt strings, one per query.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]

        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Decoder-only LMs must be left-padded for batched generation; with the
# default right padding the model would continue from pad tokens and
# produce degraded text (see HF generation docs).
processor.tokenizer.padding_side = "left"

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padding=True pads all prompts to a common length)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding for reproducibility)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, skipping the (padded) prompt tokens in each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    """Render each text query into a TowerVision chat prompt.

    Each query is wrapped in a single-turn user message containing an
    <image> placeholder, then formatted with the processor's chat
    template (with the generation prompt appended).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Bug fix: batch_size was used below but never defined (NameError).
# Process only as many pairs as have both an image URL and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs (padded to the longest prompt in the batch)
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's prompt tokens.
# NOTE(review): slicing by input_ids length assumes left padding for
# batched generation — confirm the tokenizer's padding_side.
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (Python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
# --- Batch inference example for TowerVision with Hugging Face Transformers ---

# FIX: the published snippet used `model_id` and `batch_size` without ever
# defining them, so it raised NameError when run standalone. Define both here.
model_id = "utter-project/TowerVision-2B"  # or any other TowerVision variant
batch_size = 2  # number of image/query pairs processed in one generate() call

def prepare_prompts(queries):
    """Format each user query with the TowerVision chat template.

    Args:
        queries: iterable of question strings. An ``<image>`` placeholder is
            prepended to each so the processor knows where to splice in the
            image tokens.

    Returns:
        list[str]: one chat-formatted prompt string per query, with the
        generation prompt appended (ready for batched tokenization).
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user",
                "content": f"<image>\n{query}"
            }
        ]
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images — slice by batch_size so images and prompts stay index-aligned
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts for the same slice of queries
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding=True pads shorter prompts to a common length
inputs = processor(
    text=prompts,
    images=images,
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for the whole batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping each sample's (padded) prompt tokens
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    # per-sample prompt length (includes padding, which generate() preserves)
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:],
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries, proc=None):
    """Format each text query into a TowerVision chat prompt.

    Args:
        queries: Iterable of question strings. Each query is prefixed with
            the ``<image>`` placeholder token expected by LLaVA-style
            processors so the image can be interleaved with the text.
        proc: Optional processor exposing ``apply_chat_template``. When
            omitted, falls back to the module-level ``processor`` — this
            keeps the original single-argument call sites working.

    Returns:
        list[str]: One chat-template-formatted prompt per query, with the
        generation prompt appended (``add_generation_prompt=True``).
    """
    # Backward-compatible default: the original snippet read the global.
    proc = processor if proc is None else proc
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user",
                "content": f"<image>\n{query}"
            }
        ]
        # Format message with the towervision chat template; tokenize=False
        # because the processor call below does the tokenization itself.
        prompts.append(
            proc.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=True,
            )
        )
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was referenced below but never defined, which raised a
# NameError. Derive it from the paired sample data so the lists stay aligned.
batch_size = min(len(img_urls), len(queries))

# Load images (stream each HTTP response straight into PIL)
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding=True pads every prompt to the longest one in
# the batch so they can be stacked into a single tensor.
# NOTE(review): batched decoder-only generation usually expects
# tokenizer.padding_side == "left" — confirm the TowerVision processor
# default before relying on batched outputs.
inputs = processor(
    text=prompts,
    images=images,
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for the batch (greedy decoding: do_sample=False)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    # Every row of input_ids has the same padded length; generate() returns
    # [padded prompt | new tokens], so slicing at that length keeps only the
    # newly generated tokens.
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:],
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    """Return one chat-templated prompt string per query.

    Each query is wrapped into a single-turn user conversation carrying an
    `<image>` placeholder, then rendered with the module-level `processor`'s
    TowerVision chat template (untokenized, generation prompt appended).
    `processor` must be initialised before the first call.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# BUG FIX: `batch_size` was referenced below but never defined, which raised
# NameError when the example was run. Define it explicitly (2 matches the
# number of sample URLs/queries provided).
batch_size = 2

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images (streamed straight from the URL into PIL)
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts via the chat template helper
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding=True aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (do_sample=False -> greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    # Rows of gen_tokens are [padded prompt | new tokens]; slicing off the
    # padded prompt length leaves only the generated continuation.
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Return one chat-templated prompt string per query.

    Each query is wrapped into a single-turn user conversation carrying an
    `<image>` placeholder, then rendered with the module-level `processor`'s
    TowerVision chat template (untokenized, generation prompt appended).
    `processor` must be initialised before the first call.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# BUG FIX: `batch_size` was referenced below but never defined, which raised
# NameError when the example was run. Define it explicitly (2 matches the
# number of sample URLs/queries provided).
batch_size = 2

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images (streamed straight from the URL into PIL)
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts via the chat template helper
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding=True aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (do_sample=False -> greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    # Rows of gen_tokens are [padded prompt | new tokens]; slicing off the
    # padded prompt length leaves only the generated continuation.
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    """Apply the TowerVision chat template to each text query.

    Args:
        queries: iterable of question strings; each pairs with one image.

    Returns:
        list[str]: chat-formatted prompt strings for the processor.
    """
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template.
        # NOTE(review): relies on the module-level `processor` created below;
        # it exists by the time this function is first called.
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
# `model_id` comes from the single-image example above
# (e.g. "utter-project/TowerVision-2B").
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# FIX: `batch_size` was used below without ever being defined (NameError).
# Process as many pairs as have both an image and a query.
batch_size = min(len(img_urls), len(queries))

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding aligns the variable-length prompts
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses, stripping the (padded) prompt tokens from each row
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformers (python)
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    """Turn each text query into a chat-formatted prompt string.

    Each query is wrapped in a single-turn user message that carries the
    ``<image>`` placeholder, then rendered with the towervision chat
    template. Relies on the module-level ``processor`` being initialized
    before this is called.
    """
    conversations = (
        [{"role": "user", "content": f"<image>\n{query}"}]
        for query in queries
    )
    # Format every message with the towervision chat template
    return [
        processor.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True,
        )
        for conversation in conversations
    ]

# we recommend using "bfloat16" as torch_dtype for inference
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
# NOTE(review): batched generation requires LEFT padding — every prompt must
# end at the same position so the per-item slice below starts exactly at the
# generated tokens. With right padding the slice would decode pad tokens.
processor.tokenizer.padding_side = "left"
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Fix: `batch_size` was used below but never defined (NameError at runtime).
# Pair up as many images and queries as both lists provide.
batch_size = min(len(img_urls), len(queries))

# Load images (timeout guards against a hung download)
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True, timeout=30).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs; padding=True pads shorter prompts to the longest one
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for the whole batch (greedy decoding)
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode only the newly generated tokens of each item
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    # With left padding, input_ids[i] and the prompt portion of gen_tokens[i]
    # have identical length, so this slice yields exactly the response.
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
# Batch Inference with Transformers (Python)

def prepare_prompts(queries):
    """Return one TowerVision chat-template prompt string per query."""
    prompts = []
    for query in queries:
        conversation = [{"role": "user", "content": f"<image>\n{query}"}]
        # tokenize=False keeps the prompt as text; the processor tokenizes
        # prompts and images together as one padded batch below.
        prompt = processor.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

model_id = "utter-project/TowerVision-2B"  # or any other variant
batch_size = 2  # FIX: was undefined (NameError at img_urls[:batch_size])

# we recommend using "bfloat16" as torch_dtype
kwargs = {"torch_dtype": "bfloat16", "device_map": "auto"}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]
queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?",
]

# Download one PIL image per URL (both lists truncated to batch_size)
images = [Image.open(requests.get(url, stream=True).raw) for url in img_urls[:batch_size]]
prompts = prepare_prompts(queries[:batch_size])

# Tokenize prompts + preprocess images as one padded batch
inputs = processor(
    text=prompts, images=images, return_tensors="pt", padding=True
).to(model.device)

# Greedy (do_sample=False) decoding for the whole batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

print(f"Batch processing {len(images)} images:")
print("-" * 50)
for i in range(len(images)):
    # Drop the prompt tokens so only the newly generated text is decoded
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Batch Inference with Transformerspython
def prepare_prompts(queries):
    prompts = []
    for query in queries:
        conversation = [
            {
                "role": "user", 
                "content": f"<image>\n{query}"
            }
        ]
        
        # Format message with the towervision chat template
        prompt = processor.apply_chat_template(
            conversation, 
            tokenize=False,
            add_generation_prompt=True
        )
        prompts.append(prompt)
    return prompts

# we recommend using "bfloat16" as torch_dtype
kwargs = {
    "torch_dtype": "bfloat16",
    "device_map": "auto",
}
processor = LlavaNextProcessor.from_pretrained(model_id)
model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)

# Sample images and queries for batch processing
img_urls = [
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
    "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f",
]

queries = [
    "Is this person really big, or is this building just super small?",
    "Where was this photo taken?"
]

# Load images
images = []
for url in img_urls[:batch_size]:
    image = Image.open(requests.get(url, stream=True).raw)
    images.append(image)

# Prepare prompts
prompts = prepare_prompts(queries[:batch_size])

# Prepare batch inputs
inputs = processor(
    text=prompts, 
    images=images, 
    return_tensors="pt",
    padding=True
).to(model.device)

# Generate response ids for batch
gen_tokens = model.generate(**inputs, max_new_tokens=512, do_sample=False)

# Decode responses
print(f"Batch processing {len(images)} images:")
print("-" * 50)

for i in range(len(images)):
    input_length = inputs.input_ids[i].shape[0]
    response = processor.tokenizer.decode(
        gen_tokens[i][input_length:], 
        skip_special_tokens=True
    )
    print(f"Response: {response}")
    print("-" * 50)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (transformers)
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (transformers)
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (transformers)
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usagepythontransformers
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)
Pipeline Usage — Python (Transformers)
# NOTE(review): verbatim duplicate of the earlier "Pipeline Usage" example on
# this page — a scraper/pagination artifact; kept textually intact.
from transformers import pipeline
from PIL import Image
import requests


pipe = pipeline(
    model="utter-project/TowerVision-9B", 
    task="image-text-to-text", 
    device_map="auto",
    dtype="bfloat16"
)

def prepare_prompt(query):
    """Wrap *query* in the TowerVision chat template ("<image>" slot first)."""
    conversation = [
        {
            "role": "user", 
            "content": f"<image>\n{query}"
        }
    ]
    
    # Format message with the towervision chat template
    return pipe.processor.apply_chat_template(
        conversation, 
        tokenize=False,
        add_generation_prompt=True
    )
    
    
img_url = "https://cms.mistral.ai/assets/a10b924e-56b3-4359-bf6c-571107811c8f"
image = Image.open(requests.get(img_url, stream=True).raw)
text = prepare_prompt("Is this person really big, or is this building just super small?")

outputs = pipe(text=text, images=image, max_new_tokens=300, return_full_text=False)
print(outputs)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.