Holo1-3B-GGUF

Multimodal (image-text-to-text) model, 3B params, BF16 source weights, quantized to GGUF by Mungert. 214 downloads, 1 language, license: other. Early-stage release targeting edge AI on mobile, laptop, and server hardware (7GB+ RAM).
Quick Summary

GGUF quantization of Hcompany/Holo1-3B, a 3B-parameter multimodal (image-text-to-text) model, packaged for local inference on resource-constrained devices.

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 3GB+ RAM
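
As a rough sanity check on these figures, weight memory for a GGUF file is approximately parameter count times bits per weight. The sketch below uses typical bits-per-weight values for common llama.cpp quant types; these are approximations, and real usage also depends on context length, KV cache, and runtime overhead beyond the flat allowance used here:

```python
# Back-of-the-envelope GGUF memory estimate (a sketch, not a measurement).
def est_ram_gb(params_billion: float, bits_per_weight: float, overhead_gb: float = 0.5) -> float:
    weights_gb = params_billion * bits_per_weight / 8  # 1e9 weights * bits -> GB
    return weights_gb + overhead_gb

# Approximate bits-per-weight for common quant types.
for quant, bpw in [("Q4_K_M", 4.8), ("Q6_K", 6.6), ("Q8_0", 8.5), ("BF16", 16.0)]:
    print(f"{quant:7s} ~{est_ram_gb(3.0, bpw):.1f} GB")
```

For a 3B model this lands between roughly 2GB (Q4) and 6.5GB (BF16), consistent with the 3GB+ minimum and 7GB+ upper guidance above.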

Code Examples

Load the model on the available device(s) with Transformers, and define a small inference helper:

```python
from typing import Any

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B", min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference on one image plus a list of chat messages.
def run_inference(messages: list[dict[str, Any]], image) -> list[str]:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
```
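
A hypothetical call, assuming the Qwen2.5-VL-style chat message format commonly used with AutoModelForImageTextToText checkpoints; the screenshot path and prompt text are illustrative placeholders:

```python
from PIL import Image

# Placeholder input: any RGB screenshot works here.
image = Image.open("screenshot.png")

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe what is shown in this screenshot."},
        ],
    }
]

print(run_inference(messages, image))
```

The `{"type": "image"}` entry tells the chat template where to place the vision tokens; the actual pixel data is supplied separately through the processor's `images` argument inside `run_inference`.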
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-3B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-3B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)

# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
    # Preparation for inference
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
Prepare image and instruction (Python, transformers)
import requests  # needed to fetch the example screenshot (missing from the original snippet)

from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
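With the model, processor, helper function, and resized screenshot in place, a full grounding call can be sketched as follows. This is a minimal sketch, not the card's verbatim example: the instruction text and the Click(x, y) answer format are assumptions for illustration, and any predicted coordinates refer to the resized image produced above.

# Minimal end-to-end sketch (assumed instruction and answer format, for illustration only).
instruction = "Select July 14th as the check-out date"
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},  # the resized PIL image is supplied to the processor inside run_inference
            {"type": "text", "text": f"Localize the element to interact with: {instruction}. Answer with Click(x, y)."},
        ],
    }
]
print(run_inference(messages)[0])  # e.g. a string shaped like "Click(x, y)", in resized-image pixels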
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-3B/resolve/main/calendar_example.jpg" 
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore
Navigation (python)
import json

from . import navigation  # navigation prompt helpers shipped with the model repository

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
# Bind the result to a distinct name so the `navigation` module is not shadowed.
navigation_step = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation_step)
# Expected: NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
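The navigation output is free-form generated text, so `json.loads` can fail on a malformed generation. A defensive variant is sketched below; it assumes the `run_inference`, `prompt`, and `navigation` names from the snippets above, and `parse_navigation_step` is a hypothetical wrapper, not part of the model card.

Parse a navigation step defensively (python, sketch)
# Minimal sketch: guard the JSON parse instead of assuming every generation is
# valid. `parse_navigation_step` is a hypothetical helper.
import json

def parse_navigation_step(raw: str):
    try:
        return navigation.NavigationStep(**json.loads(raw))
    except (json.JSONDecodeError, TypeError, ValueError) as err:
        # ValueError also covers pydantic validation errors, if NavigationStep
        # is a pydantic model.
        print(f"Could not parse navigation step: {err!r}")
        return None

step = parse_navigation_step(run_inference(prompt)[0])
if step is not None:
    print(step.action)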
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
type: ignorepython
import json
from . import navigation

task = "Book a hotel in Paris on August 3rd for 3 nights"
prompt = navigation.get_navigation_prompt(task, image, step=1)
navigation_str = run_inference(prompt)[0]
navigation = navigation.NavigationStep(**json.loads(navigation_str))
print(navigation)
# Expected NavigationStep(note='', thought='I need to select the check-out date as August 3rd and then proceed to search for hotels.', action=ClickElementAction(action='click_element', element='August 3rd on the calendar', x=777, y=282))
Localization with click(x, y) (python)

# The localization helpers are provided alongside the model repository.
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt(image, instruction)
coordinates = run_inference(prompt)[0]
print(coordinates)
# Expected Click(352, 348)
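The plain-text variant returns a string such as Click(352, 348). A small parser like the one below (a hedged sketch; the exact output string may vary between generations, so the regex is defensive) converts it into integer pixel coordinates:

import re

def parse_click(raw: str) -> tuple[int, int]:
    # Extract (x, y) from a string like "Click(352, 348)".
    match = re.search(r"Click\((\d+)\s*,\s*(\d+)\)", raw)
    if match is None:
        raise ValueError(f"unrecognized localization output: {raw!r}")
    return int(match.group(1)), int(match.group(2))

x, y = parse_click("Click(352, 348)")
print(x, y)  # 352 348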
Localization with Structured Output (python)

import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
# The model returns a JSON string that parses directly into the repo's ClickAction schema.
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
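If you want to validate the structured output without the repo's helpers, a schema equivalent to what the example implies could be defined as below. This is an assumption: the field set is inferred from the printed ClickAction, and the shipped class may carry extra fields or stricter validation.

import json
from pydantic import BaseModel

class ClickAction(BaseModel):
    # Fields inferred from the printed output above (an assumption, not the
    # schema shipped with the model repository).
    action: str
    x: int
    y: int

raw = '{"action": "click", "x": 352, "y": 340}'  # example payload
print(ClickAction(**json.loads(raw)))
# action='click' x=352 y=340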
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)
Localization with Structured Outputpython
import json
from . import localization

instruction = "Select July 14th as the check-out date"
prompt = localization.get_localization_prompt_structured_output(image, instruction)
coordinates_structured_str = run_inference(prompt)[0]
coordinates_structured = localization.ClickAction(**json.loads(coordinates_structured_str))
print(coordinates_structured)
# Expected ClickAction(action='click', x=352, y=340)

Deploy This Model

Production-ready deployment in minutes:

Together.ai (fastest API) — instant API access to this model through a production-ready inference API. Start free, scale to millions of requests.

Replicate (easiest setup) — one-click deployment; run the model in the cloud through a simple API, no DevOps required.

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.