SAIL-VL2-8B-Thinking

149
6
8.0B
license:apache-2.0
by
BytedanceDouyinContent
Code Model
OTHER
8B params
New
149 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
18GB+ RAM
Mobile
Laptop
Server
Quick Summary

[📖 Technique Report] [🤗 SAIL-VL2-2B] [🤗 SAIL-VL2-8B] [🤗 SAIL-VL2-2B-Thinking] [🤗 SAIL-VL2-8B-Thinking] [💻 Github] We are very excited to introduce SAIL-V...

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
8GB+ RAM

Code Examples

🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)
🎬 Quick Startpythontransformers
import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里?" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.