Holo1-7B-GGUF
662
7.0B
1 language
BF16
license:apache-2.0
by
Mungert
Multimodal
OTHER
7B params
New
662 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
16GB+ RAM
Mobile
Laptop
Server
Quick Summary
Holo1-7B: a 7B-parameter multimodal (image-text-to-text) model distributed in GGUF format under the Apache-2.0 license.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
7GB+ RAM
Code Examples
# default: Load the model on the available device(s) — python / transformers
"""Inference example for Hcompany/Holo1-7B (image-text-to-text).

NOTE(review): the scraped page repeated this identical snippet 22 times,
with section headers fused onto the last code line of each copy; it is
collapsed here into a single, syntactically valid copy.
"""
from typing import Any

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s).
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-7B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# Default processor.
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token
# range of 256-1280, to balance performance and cost:
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")


def run_inference(messages: list[dict[str, Any]], image: Any = None) -> str:
    """Run one chat-style generation over ``messages``.

    Args:
        messages: Chat messages in the processor's chat-template format.
        image: The image(s) referenced by ``messages`` (e.g. a PIL image or a
            list of them). The original snippet read an undefined module-level
            ``image`` global, which raised ``NameError`` at call time; it is
            now an explicit parameter with a ``None`` default so the signature
            stays backward-compatible.

    Returns:
        The decoded model response for the single batch element. (The original
        returned the whole ``batch_decode`` list despite the ``-> str``
        annotation; returning element 0 matches the declared contract, since
        exactly one prompt is submitted.)
    """
    # Preparation for inference: render the chat template, then tokenize
    # text + image together.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's actual placement (device_map="auto") instead of
    # hard-coding "cuda", so the example also works on CPU/MPS.
    inputs = inputs.to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens from each generated sequence so only the newly
    # generated continuation is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]], images: Any = None) -> list[str]:
    """Run one generation pass of the model over chat-formatted *messages*.

    Args:
        messages: Chat messages in the processor's chat-template format.
        images: Image(s) to pair with the prompt. Defaults to the
            module-level ``image`` for backward compatibility with the
            original snippet (NOTE(review): that global is never defined
            in this file — confirm the caller creates it).

    Returns:
        The decoded generation for each sequence in the batch (a single
        entry here, since one prompt is passed). The original annotated
        ``-> str``, but ``batch_decode`` returns a list of strings.
    """
    if images is None:
        images = image  # original behavior: read the module-level global
    # Render the chat template into a single prompt string, with the
    # assistant generation prompt appended.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=images,
        padding=True,
        return_tensors="pt",
    )
    # Use the model's own device rather than hard-coding "cuda": with
    # device_map="auto" the inputs must land where the first layers live,
    # and this also keeps the helper working on CPU-only machines.
    inputs = inputs.to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
# device_map="auto" shards/places the weights automatically across the
# available GPU(s)/CPU; torch_dtype="auto" uses the checkpoint's native dtype.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
# Bundles the tokenizer and the image preprocessor used by run_inference.
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# NOTE(review): `model_dir`, `min_pixels`, `max_pixels` in the commented line
# below are not defined in this snippet — define them before enabling it.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
# --- Holo1-7B usage example (deduplicated) --------------------------------
# The page extraction repeated this identical snippet 23 times; each copy
# re-loaded the 7B model and ended in a line with fused page text
# ("...spaces=False)default: Load the model...pythontransformers"), a syntax
# error. A single corrected copy is kept below.

import json
import os
from typing import Any, Literal

from transformers import AutoModelForImageTextToText, AutoProcessor

# default: Load the model on the available device(s).
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
    "Hcompany/Holo1-7B",
    torch_dtype="auto",
    # torch_dtype=torch.bfloat16,
    # attn_implementation="flash_attention_2",
    device_map="auto",
)

# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")

# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token
# range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)


def run_inference(messages: list[dict[str, Any]]) -> str:
    """Run one generation pass over chat-formatted ``messages``.

    Relies on the module-level ``model``, ``processor`` and ``image`` globals.
    ``image`` is the screenshot referenced by the messages; it is presumably
    defined earlier in the original example and was lost by the page
    extraction — TODO confirm where it is set in this file.

    Returns:
        The decoded model output for the single submitted prompt.
    """
    # Render the chat template into a prompt string (generation prompt appended).
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[text],
        images=image,  # NOTE(review): `image` must exist at module level before calling
        padding=True,
        return_tensors="pt",
    )
    # Move inputs to wherever device_map="auto" placed the model; unlike a
    # hard-coded .to("cuda") this also works on CPU-only hosts.
    inputs = inputs.to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    decoded = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # batch_decode returns a list; a single prompt was submitted, so return its
    # text — this matches the declared `-> str` return type.
    return decoded[0]
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> list[str]:
    """Run one generation pass over *messages* and return the decoded text.

    Args:
        messages: Chat-format messages fed to the processor's chat template.

    Returns:
        The decoded generation(s), one string per batch element, as produced
        by ``processor.batch_decode``. (The original annotation said ``str``,
        but ``batch_decode`` returns a list of strings.)
    """
    # Build the prompt text from the chat template, appending the generation prompt.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): `image` is a module-level name not defined in this snippet —
    # confirm it is set (the screenshot being queried) before calling this helper.
    inputs = processor(
        text=[text],
        images=image,
        padding=True,
        return_tensors="pt",
    )
    # Move tensors to the device the model was dispatched to instead of
    # hard-coding "cuda" (the model is loaded with device_map="auto").
    inputs = inputs.to(model.device)
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # Strip the prompt tokens from each generated sequence, keeping only new tokens.
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# default: Load the model on the available device(s) — python / transformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)default: Load the model on the available device(s)pythontransformers
import json
import os
from typing import Any, Literal
from transformers import AutoModelForImageTextToText, AutoProcessor
# default: Load the model on the available device(s)
# We recommend enabling flash_attention_2 for better acceleration and memory saving.
model = AutoModelForImageTextToText.from_pretrained(
"Hcompany/Holo1-7B",
torch_dtype="auto",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
device_map="auto",
)
# default processor
processor = AutoProcessor.from_pretrained("Hcompany/Holo1-7B")
# The default range for the number of visual tokens per image in the model is 4-1280.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# processor = AutoProcessor.from_pretrained(model_dir, min_pixels=min_pixels, max_pixels=max_pixels)
# Helper function to run inference
def run_inference(messages: list[dict[str, Any]]) -> str:
# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(
text=[text],
images=image,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
return processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
# --- Prepare image and instruction (Holo1-7B UI-grounding example) ---
#
# NOTE(review): the scraped page repeated this exact snippet dozens of
# times back-to-back, with the section heading fused onto the last code
# line of each copy. A single clean copy is kept here; each copy was
# byte-identical and idempotent (it re-opens the image from the URL), so
# the final program state is unchanged.
#
# Depends on `processor` (the AutoProcessor loaded earlier on this page)
# already being in scope.
import requests
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize

# Download the example screenshot used for click-coordinate grounding.
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size
# of the image the model actually sees: smart_resize snaps (height, width)
# to multiples of patch_size * merge_size while keeping the pixel count
# within [min_pixels, max_pixels].
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
# smart_resize returns (height, width); PIL.Image.resize takes (width, height).
# resample=None defers to Pillow's default filter for the image mode —
# NOTE(review): confirm this matches the filter used at training time.
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore

# Natural-language task the model should ground to a click location.
instruction = "Select July 14th as the check-out date"
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
# Prepare image and instruction (Holo1-7B grounding example).
#
# NOTE(review): the scraped page contained this snippet duplicated ~40 times,
# with the markdown heading of the next copy fused onto each copy's final
# line ('...date"Prepare image and instructionpythontransformers'), which is
# not valid Python. The duplicates are collapsed into this single copy.
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests

# Fetch the example screenshot used by the model card.
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)

# Resize the image so that predicted absolute coordinates match the size of the image.
# smart_resize snaps the dimensions to multiples of patch_size * merge_size and
# clamps the total pixel count to [min_pixels, max_pixels], mirroring what the
# processor does internally, so model-predicted pixel coordinates line up.
# NOTE(review): `processor` must already be defined by the model-loading
# snippet earlier in this card — confirm it is in scope before running.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
    image.height,
    image.width,
    factor=image_processor.patch_size * image_processor.merge_size,
    min_pixels=image_processor.min_pixels,
    max_pixels=image_processor.max_pixels,
)
# PIL's resize takes (width, height); resample=None lets Pillow pick its
# default resampling filter.
image = image.resize(size=(resized_width, resized_height), resample=None)  # type: ignore

instruction = "Select July 14th as the check-out date"

# Prepare image and instruction
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"Prepare image and instructionpythontransformers
from PIL import Image
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
import requests
# Prepare image and instruction
image_url = "https://huggingface.co/Hcompany/Holo1-7B/resolve/main/calendar_example.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Resize the image so that predicted absolute coordinates match the size of the image.
image_processor = processor.image_processor
resized_height, resized_width = smart_resize(
image.height,
image.width,
factor=image_processor.patch_size * image_processor.merge_size,
min_pixels=image_processor.min_pixels,
max_pixels=image_processor.max_pixels,
)
image = image.resize(size=(resized_width, resized_height), resample=None) # type: ignore
instruction = "Select July 14th as the check-out date"type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
    """Build a single-turn chat message asking the model to localize a UI element.

    Args:
        image: The (already resized) PIL image; passed through untouched.
        instruction: Natural-language description of the element to click.

    Returns:
        A one-element message list in the chat format expected by the processor.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # The scrape split this f-string over two physical lines (a syntax
                # error); the original joins guidelines and instruction with "\n".
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        }
    ]
# Run grounding inference and print the predicted click position.
messages = get_localization_prompt(image, instruction)
# run_inference (defined earlier in the file) returns a batch of decoded
# strings; take the first (only) one.
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected: Click(352, 348)
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
    """Return the chat messages for a click-localization request.

    Args:
        image: Resized PIL image forwarded as-is in the message content.
        instruction: What to locate on the screenshot.

    Returns:
        A single user message combining the image and the grounding prompt.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
    # Repaired from the scrape: the text field is one f-string with an embedded
    # newline, not two separate source lines.
    prompt_text = f"{guidelines}\n{instruction}"
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": prompt_text},
            ],
        }
    ]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat-style message list for a GUI-element localization query.

    Args:
        image: The screenshot to localize on. Passed through unchanged to the
            processor, so any image type the processor accepts works here
            (presumably a PIL image — TODO confirm against the caller).
        instruction: Natural-language description of the element to find.

    Returns:
        A single-element list containing one "user" message whose content is
        the image followed by the guidelines text and the instruction,
        separated by a newline — the prompt format the model was trained on.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # Guidelines and instruction joined by a literal newline,
                # restoring the single-line f-string the scrape split apart.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        }
    ]


# Driver: build the prompt, run inference, and print the predicted click.
# `image`, `instruction`, and `run_inference` are defined earlier in the file.
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected: Click(352, 348)
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position as Click(x, y) with x num pixels from the left edge and y num pixels from the top edge."
return [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
}
]
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)type: ignorepython
def get_localization_prompt(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for a single UI-element localization query.

    Args:
        image: The GUI screenshot to localize against. Any image format the
            processor accepts; the type is not enforced here.
        instruction: Natural-language description of the element to find.

    Returns:
        A single-turn message list suitable for
        ``processor.apply_chat_template``: one user message carrying the
        image followed by the guidelines and the instruction as text.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position as Click(x, y) with x num pixels from the left "
        "edge and y num pixels from the top edge."
    )
    return [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # Guidelines and instruction are joined with a newline so the
                # model sees them as two lines of a single text segment.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        }
    ]


# `image`, `instruction` and `run_inference` are defined earlier in the file.
messages = get_localization_prompt(image, instruction)
coordinates_str = run_inference(messages)[0]
print(coordinates_str)
# Expected Click(352, 348)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the
            function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so `{}` is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings turns the attribute docstrings below into
    # JSON-schema field descriptions, which the model sees via the serialized
    # tool definition in the system prompt.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model through the system message below.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages asking for a structured (JSON tool-call) click position.

    Args:
        image: The GUI screenshot to localize against.
        instruction: Natural-language description of the element to find.

    Returns:
        A two-message list: a system message holding the serialized tool
        definition, and a user message carrying the image, the guidelines,
        and the instruction.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # Guidelines and instruction joined with a newline in one
                # text segment, matching the plain-text prompt variant.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction` and `run_inference` are defined earlier in the file.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
# The model answers with a tool-call payload: {"name": ..., "arguments": {...}};
# validate the arguments against the ClickAction schema.
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # Schema generation is tuned so the model sees a strict, fully specified
    # JSON schema: extra keys are rejected, defaults are marked required in the
    # serialized schema, and the bare-string attribute docstrings below become
    # field descriptions (use_attribute_docstrings=True) — do not remove them.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
# Tool/function definition advertised to the model: the ClickAction JSON
# schema wrapped in an OpenAI-style function definition. It is serialized
# into the system message by get_localization_prompt_structured_output().
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",  # fall back to "" if the docstring is stripped
    parameters=ClickAction.model_json_schema(),
    strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for structured-output UI localization.

    Args:
        image: The GUI screenshot (any image object the chat processor accepts).
        instruction: Natural-language description of the element to localize.

    Returns:
        A two-message conversation: a system message carrying the
        JSON-serialized function (tool) definition, and a user message holding
        the image plus the guidelines and instruction text.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            # The tool schema travels as a JSON string in the system message.
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # BUG FIX: the scraped source had a literal newline inside a
                # single-quoted f-string (a SyntaxError); use an explicit \n.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run localization end-to-end: build the prompt, run the model, and parse the
# model's JSON tool call back into a ClickAction.
# NOTE(review): `image`, `instruction`, and `run_inference` come from the
# model/processor setup section earlier in this example — confirm they are in
# scope before running this snippet standalone.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
# Decode the JSON tool call; only its "arguments" object is validated here.
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
# Action schema the model must emit. Its JSON schema is advertised as the tool
# "parameters". The bare-string field docstrings below are intentional: they are
# exported into the generated schema because use_attribute_docstrings=True.
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""
    model_config = ConfigDict(
        extra="forbid",  # reject unknown keys in the model's JSON output
        json_schema_serialization_defaults_required=True,  # defaults appear as required in the schema
        json_schema_mode_override="serialization",  # emit the serialization-mode schema
        use_attribute_docstrings=True,  # include the field docstrings below as schema descriptions
    )
    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
# Tool definition advertised to the model: ClickAction's docstring as the
# description and its JSON schema as the parameter contract.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",  # "" fallback in case docstrings are stripped (-OO)
    parameters=ClickAction.model_json_schema(),
    strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for structured-output UI localization.

    The system message carries the JSON-serialized tool definition; the user
    message carries the screenshot plus the guidelines and the instruction.

    Args:
        image: Screenshot of the GUI (any image type the processor accepts).
        instruction: Natural-language description of the element to locate.

    Returns:
        Chat messages suitable for ``processor.apply_chat_template``.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            # Advertise the tool; the model is expected to reply with a JSON
            # function call matching ClickAction's schema.
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # FIX: the text previously contained a raw line break inside a
                # single-quoted f-string (a SyntaxError); use an explicit \n.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Driver: build the prompt, run the model, and validate its JSON tool call
# against the ClickAction schema. `image`, `instruction`, and `run_inference`
# are defined earlier in the example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
# The model returns a JSON function call; its "arguments" field holds the click.
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # These settings shape the output of model_json_schema():
    # - extra="forbid": the generated schema rejects unknown keys;
    # - json_schema_serialization_defaults_required=True: defaulted fields
    #   (e.g. `action`) are still marked required in the schema;
    # - json_schema_mode_override="serialization": schema is built in
    #   serialization mode;
    # - use_attribute_docstrings=True: the bare-string docstrings under the
    #   fields below become the schema's field descriptions. Do not edit
    #   those strings casually — they are part of the prompt the model sees.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )
    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
# Build the tool definition handed to the model: the ClickAction docstring
# becomes the function description and its pydantic-generated JSON schema
# becomes the parameter schema.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for structured-output element localization.

    Args:
        image: The GUI screenshot forwarded to the processor (presumably a
            PIL image or image URL accepted by the chat template — confirm
            against the run_inference helper defined earlier in the file).
        instruction: Natural-language description of the element to locate.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized
        function definition (module-level ``function_definition``), and a
        user message containing the image plus the guidelines and the
        instruction separated by a newline.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            # The tool definitions are passed as a JSON array in the system
            # message, matching the model's structured-output convention.
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # Fix: the scraped version contained a literal newline inside
                # the f-string (a SyntaxError); the intended escape is "\n".
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run localization end-to-end. `image`, `instruction`, and `run_inference`
# are defined earlier in the file (see the model-loading example above).
messages = get_localization_prompt_structured_output(image, instruction)
# run_inference returns a batch of decoded strings; take the first element.
coordinates_str = run_inference(messages)[0]
# The model replies with a JSON function call; validate its "arguments"
# payload into a ClickAction instance (raises if the schema is violated).
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected output: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # Serialize defaults into the generated JSON schema so the model sees the
    # full function-call contract; with use_attribute_docstrings=True the
    # bare-string attribute docstrings below become field descriptions, so
    # they are intentional and must stay.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
# Advertise ClickAction to the model as an OpenAI-style function definition;
# the parameter schema is derived from the pydantic model above.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for structured-output click localization.

    Args:
        image: Image object accepted by the processor (e.g. a PIL image) —
            assumed, not shown here; TODO confirm against the caller.
        instruction: Natural-language description of the UI element to find.

    Returns:
        Chat messages: a system turn carrying the function definition
        serialized as JSON, then a user turn with the image followed by the
        guidelines and the instruction.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            # The model expects the available tools as a JSON list of
            # function definitions in the system prompt.
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # Fixed: the scraped source had a literal newline inside the
                # f-string (a SyntaxError); use an explicit \n instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run localization for one (image, instruction) pair and parse the model's
# structured function-call output back into a validated ClickAction.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # Serialization-mode JSON schema with defaults marked required and extra
    # keys forbidden; use_attribute_docstrings lifts the field docstrings
    # below into the generated schema, so their exact text is behavioral.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
# Function-calling tool definition advertised to the model in the system
# prompt; description and parameter schema are derived from ClickAction.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for structured-output GUI localization.

    Args:
        image: the screenshot (PIL image or equivalent) to localize on.
        instruction: what to locate, e.g. a UI element description.

    Returns:
        A two-message chat: a system message carrying the click_action
        function schema as JSON, and a user message with the image plus
        the guidelines and instruction separated by a newline.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: the raw line break inside the f-string was a
                # SyntaxError; the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Build the localization prompt for the screenshot + instruction (both
# defined earlier in the example) and run the model on it.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
# The model answers with a JSON function call; its "arguments" member
# validates into a ClickAction carrying pixel coordinates.
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build chat messages (system tool schema + user image/text) for GUI localization."""
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {"role": "system", "content": json.dumps([function_definition.model_dump()])},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                # FIX: raw line break inside the f-string was a SyntaxError;
                # the intended separator is a newline escape.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)  (elsewhere shown as Click(352, 348))
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        # Reject unexpected keys, so generated JSON must match the schema exactly.
        extra="forbid",
        # Serialize fields that have defaults as required in the JSON schema.
        json_schema_serialization_defaults_required=True,
        # Emit the schema in serialization mode (output shape, not input shape).
        json_schema_mode_override="serialization",
        # Turn the bare-string docstrings under each field below into
        # per-field `description` entries in the generated schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""
# Tool spec advertised to the model: the ClickAction JSON schema wrapped in an
# OpenAI-style function definition (name + description + parameter schema).
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages for structured-output UI localization.

    Args:
        image: The GUI screenshot, in any format the processor accepts.
        instruction: Natural-language description of the element to locate.

    Returns:
        A two-message conversation: a system message carrying the JSON-encoded
        function definition, and a user message with the image followed by the
        guidelines and the instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions "
        "and output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            # The system prompt is the serialized tool/function definition list.
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the original had a literal raw newline inside a
                # single-quoted f-string (a SyntaxError); use an escaped "\n".
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# End-to-end example: build the prompt, run generation, then parse the model's
# JSON function-call arguments back into a validated ClickAction.
# (`image`, `instruction`, and `run_inference` are defined earlier in the page.)
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)

Expected Click(352, 348)

python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict
class FunctionDefinition(BaseModel):
"""Function definition data structure.
Attributes:
name: name of the function.
description: description of the function.
parameters: JSON schema for the function parameters.
strict: Whether to enable strict schema adherence when generating the function call.
"""
name: str
description: str = ""
parameters: dict[str, Any] = {}
strict: bool = True
class ClickAction(BaseModel):
"""Click at specific coordinates on the screen."""
model_config = ConfigDict(
extra="forbid",
json_schema_serialization_defaults_required=True,
json_schema_mode_override="serialization",
use_attribute_docstrings=True,
)
action: Literal["click"] = "click"
x: int
"""The x coordinate, number of pixels from the left edge."""
y: int
"""The y coordinate, number of pixels from the top edge."""
function_definition = FunctionDefinition(
name="click_action",
description=ClickAction.__doc__ or "",
parameters=ClickAction.model_json_schema(),
strict=True,
)
def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
return [
{
"role": "system",
"content": json.dumps([function_definition.model_dump()]),
},
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": f"{guidelines}
{instruction}"},
],
},
]
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected ClickAction(action='click', x=352, y=340)Expected Click(352, 348)python
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # NOTE: pydantic deep-copies field defaults per instance, so a mutable {} default is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    # use_attribute_docstrings=True lifts the attribute docstrings below into the JSON schema.
    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        json_schema_mode_override="serialization",
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Function (tool) definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize *instruction* on *image*.

    The system message carries the JSON-serialized function definition so the
    model answers with a structured function call; the user message carries the
    screenshot plus the guidelines and instruction text.
    """
    guidelines: str = "Localize an element on the GUI image according to my instructions and output a click position. You must output a valid JSON format."
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]


# `image`, `instruction`, and `run_inference` are defined earlier in this example.
messages = get_localization_prompt_structured_output(image, instruction)
coordinates_str = run_inference(messages)[0]
coordinates = ClickAction.model_validate(json.loads(coordinates_str)["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
import json
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so a literal {} is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        # Emit the serialization-mode schema so defaults (e.g. action) appear required.
        json_schema_mode_override="serialization",
        # Attribute docstrings below become field descriptions in the JSON schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize a GUI element.

    Args:
        image: The GUI screenshot (any image object the processor accepts).
        instruction: Natural-language description of the element to click.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized tool
        definition, and a user message with the image plus the text instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the source had a raw line break inside this f-string,
                # which is a SyntaxError; use an explicit \n escape instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run the localization prompt through the model and parse the structured reply.
messages = get_localization_prompt_structured_output(image, instruction)
raw_output = run_inference(messages)[0]
payload = json.loads(raw_output)
coordinates = ClickAction.model_validate(payload["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
import json
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so a literal {} is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        # Emit the serialization-mode schema so defaults (e.g. action) appear required.
        json_schema_mode_override="serialization",
        # Attribute docstrings below become field descriptions in the JSON schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize a GUI element.

    Args:
        image: The GUI screenshot (any image object the processor accepts).
        instruction: Natural-language description of the element to click.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized tool
        definition, and a user message with the image plus the text instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the source had a raw line break inside this f-string,
                # which is a SyntaxError; use an explicit \n escape instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run the localization prompt through the model and parse the structured reply.
messages = get_localization_prompt_structured_output(image, instruction)
raw_output = run_inference(messages)[0]
payload = json.loads(raw_output)
coordinates = ClickAction.model_validate(payload["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
import json
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so a literal {} is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        # Emit the serialization-mode schema so defaults (e.g. action) appear required.
        json_schema_mode_override="serialization",
        # Attribute docstrings below become field descriptions in the JSON schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize a GUI element.

    Args:
        image: The GUI screenshot (any image object the processor accepts).
        instruction: Natural-language description of the element to click.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized tool
        definition, and a user message with the image plus the text instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the source had a raw line break inside this f-string,
                # which is a SyntaxError; use an explicit \n escape instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run the localization prompt through the model and parse the structured reply.
messages = get_localization_prompt_structured_output(image, instruction)
raw_output = run_inference(messages)[0]
payload = json.loads(raw_output)
coordinates = ClickAction.model_validate(payload["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
import json
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so a literal {} is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        # Emit the serialization-mode schema so defaults (e.g. action) appear required.
        json_schema_mode_override="serialization",
        # Attribute docstrings below become field descriptions in the JSON schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize a GUI element.

    Args:
        image: The GUI screenshot (any image object the processor accepts).
        instruction: Natural-language description of the element to click.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized tool
        definition, and a user message with the image plus the text instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the source had a raw line break inside this f-string,
                # which is a SyntaxError; use an explicit \n escape instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run the localization prompt through the model and parse the structured reply.
messages = get_localization_prompt_structured_output(image, instruction)
raw_output = run_inference(messages)[0]
payload = json.loads(raw_output)
coordinates = ClickAction.model_validate(payload["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
import json
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so a literal {} is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        # Emit the serialization-mode schema so defaults (e.g. action) appear required.
        json_schema_mode_override="serialization",
        # Attribute docstrings below become field descriptions in the JSON schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize a GUI element.

    Args:
        image: The GUI screenshot (any image object the processor accepts).
        instruction: Natural-language description of the element to click.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized tool
        definition, and a user message with the image plus the text instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the source had a raw line break inside this f-string,
                # which is a SyntaxError; use an explicit \n escape instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run the localization prompt through the model and parse the structured reply.
messages = get_localization_prompt_structured_output(image, instruction)
raw_output = run_inference(messages)[0]
payload = json.loads(raw_output)
coordinates = ClickAction.model_validate(payload["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)
import json
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict


class FunctionDefinition(BaseModel):
    """Function definition data structure.

    Attributes:
        name: name of the function.
        description: description of the function.
        parameters: JSON schema for the function parameters.
        strict: Whether to enable strict schema adherence when generating the function call.
    """

    name: str
    description: str = ""
    # Pydantic deep-copies mutable defaults per instance, so a literal {} is safe here.
    parameters: dict[str, Any] = {}
    strict: bool = True


class ClickAction(BaseModel):
    """Click at specific coordinates on the screen."""

    model_config = ConfigDict(
        extra="forbid",
        json_schema_serialization_defaults_required=True,
        # Emit the serialization-mode schema so defaults (e.g. action) appear required.
        json_schema_mode_override="serialization",
        # Attribute docstrings below become field descriptions in the JSON schema.
        use_attribute_docstrings=True,
    )

    action: Literal["click"] = "click"
    x: int
    """The x coordinate, number of pixels from the left edge."""
    y: int
    """The y coordinate, number of pixels from the top edge."""


# Tool definition advertised to the model via the system prompt.
function_definition = FunctionDefinition(
    name="click_action",
    description=ClickAction.__doc__ or "",
    parameters=ClickAction.model_json_schema(),
    strict=True,
)


def get_localization_prompt_structured_output(image, instruction: str) -> list[dict[str, Any]]:
    """Build the chat messages asking the model to localize a GUI element.

    Args:
        image: The GUI screenshot (any image object the processor accepts).
        instruction: Natural-language description of the element to click.

    Returns:
        A two-message chat: a system message carrying the JSON-serialized tool
        definition, and a user message with the image plus the text instruction.
    """
    guidelines: str = (
        "Localize an element on the GUI image according to my instructions and "
        "output a click position. You must output a valid JSON format."
    )
    return [
        {
            "role": "system",
            "content": json.dumps([function_definition.model_dump()]),
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                # BUG FIX: the source had a raw line break inside this f-string,
                # which is a SyntaxError; use an explicit \n escape instead.
                {"type": "text", "text": f"{guidelines}\n{instruction}"},
            ],
        },
    ]
# Run the localization prompt through the model and parse the structured reply.
messages = get_localization_prompt_structured_output(image, instruction)
raw_output = run_inference(messages)[0]
payload = json.loads(raw_output)
coordinates = ClickAction.model_validate(payload["arguments"])
print(coordinates)
# Expected: ClickAction(action='click', x=352, y=340)

Deploy This Model
Production-ready deployment in minutes.

Together.ai — instant API access to this model. Production-ready inference API. Start free, scale to millions.
Try Free API

Replicate — one-click model deployment. Run models in the cloud with a simple API. No DevOps required.
Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.