SAIL-VL2-8B-Thinking

149

6

8.0B

license:apache-2.0

by

BytedanceDouyinContent

Code Model

OTHER

8B params

New

149 downloads

Early-stage

Try on Hugging Face Add to Compare

Edge AI:

Mobile

Laptop

Server

18GB+ RAM

Mobile

Laptop

Server

Quick Summary

[📖 Technique Report] [🤗 SAIL-VL2-2B] [🤗 SAIL-VL2-8B] [🤗 SAIL-VL2-2B-Thinking] [🤗 SAIL-VL2-8B-Thinking] [💻 Github] We are very excited to introduce SAIL-V...

Device Compatibility

Mobile

4-6GB RAM

Laptop

16GB RAM

Server

GPU

Minimum Recommended

8GB+ RAM

Code Examples

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

🎬 Quick Startpythontransformers

import torch
from transformers import AutoTokenizer, AutoModel, AutoProcessor
from PIL import Image


model_path = "your model path"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
device = torch.cuda.current_device()
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16,).to(device)

print("##### with images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {"role": "user", "content": [{"type": "image", "image": 'image_path'}, 
    {"type": "text", "text": "describe the image" + cot_prompt}]}
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

image_path = 'your image path'
image = Image.open(image_path)
inputs = processor(images=image, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)

generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)


print("##### without images")
cot_prompt = r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. The reasoning process MUST BE enclosed within <think> </think> tags. The final answer MUST BE put in \boxed{}."
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "中国的首都是哪里？" + cot_prompt}]
    }
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
inputs = processor(images=None, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device).to(torch.bfloat16)
generated_ids = model.generate(**inputs, max_new_tokens=512)
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
response = response.split('<|im_end|>')[0].strip()
print(response)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.