GUI-Actor-Verifier-2B
98
12
license:mit
by
microsoft
Image Model
OTHER
2B params
New
98 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
5GB+ RAM
Mobile
Laptop
Server
Quick Summary
AI model with specialized capabilities.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
2GB+ RAM
Code Examples
load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    marked image and a fixed True/False question are sent to ``model`` and the
    decoded reply is parsed.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an opened PIL image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model reply, or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is missing or unreadable.
        image = Image.open(image)
    # Draw on a copy so the caller's image stays clean and repeated calls with
    # different points do not accumulate circles on the same screenshot.
    image = image.copy()
    height = image.size[1]
    # The circle is scaled with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; it
    # presumably matches the prompt the model was trained with - confirm
    # before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` only matters when sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # The last match wins; no match means the reply was not a True/False answer.
    return matches[-1] if matches else 'Error Format'
# Example: given the image path, the instruction and the coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model · python · transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # inference only

# Number of new tokens requested from `model.generate` for the answer.
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle on ``img``.

    The drawing is done directly on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinates of the circle centre, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` pixel box outlined in yellow, or
            ``None``/empty to skip it.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels and
            its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, after drawing.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/len checks (not bare truthiness) so NumPy arrays work too.
    if bbox is not None and len(bbox):
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel):
        # Convert to plain Python ints so Pillow never receives NumPy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens requested from generate().
output_len = 1

# NOTE(review): trust_remote_code=True executes code shipped with the
# checkpoint; only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode; eval() returns the module itself.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Paint an optional box and an optional hollow circle onto ``img``.

    ``img`` is drawn on in place and the same object is returned. Nothing is
    written to disk: ``output_path`` is accepted but never used.

    Args:
        img: Image handed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle; skipped when falsy.
        bbox: Box passed straight to ``draw.rectangle`` (assumed to be
            ``[x1, y1, x2, y2]``); skipped when falsy.
        output_path: Unused.
        color: Outline colour of the circle.
        size: Scale factor for the circle: the radius is ``ceil(8 * size)``
            and the stroke width is ``ceil(4 * size)``.

    Returns:
        ``img``, after drawing.
    """
    canvas = ImageDraw.Draw(img)

    # Box outline: yellow, fixed stroke width of 4.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # The ellipse is specified by its bounding square around the point.
    cx, cy = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [cx - radius, cy - radius, cx + radius, cy + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image and a fixed True/False question are sent to ``model``, and
    the last ``True``/``False`` word found in the decoded reply is returned.
    The decoded reply is also printed.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a filesystem path to one.
        point: ``[x, y]`` position of the candidate point, in pixels.

    Returns:
        ``'True'`` or ``'False'`` as a string, or ``'Error Format'`` when the
        reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable,
        # so no separate path check is needed. In-memory images are used
        # directly instead of going through a temp-file helper that is not
        # defined anywhere in this script.
        image = Image.open(image)

    # Annotate a copy: draw_annotations paints in place, and marking the
    # caller's image would leave stale circles behind when the same
    # screenshot is checked against several candidate points.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim; it is
    # presumably the prompt the model was tuned on -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": annotated,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level token budget for the reply.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, ask the verifier
# whether the coordinate is the right target for the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the
# image size. (A stray comma previously made this `int(0.1548, *height)`,
# which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither is found).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded generation model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already-loaded PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        OSError: Propagated from ``Image.open`` when ``image`` is a path that
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    if isinstance(image, str):
        # Image.open itself raises if the path is not a readable image file,
        # so no separate existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image does not accumulate circles
        # across repeated calls with different candidate points.
        image = image.copy()

    _, height = image.size
    # The marker size scales with the image height (factor 1.2 per 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is kept verbatim; presumably
    # it matches the prompt the verifier was trained with - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding: temperature has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last True/False mentioned in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, an instruction and a candidate coordinate, verify the coordinate.
instruction = 'close this window'
image_path = 'test.png'
image = Image.open(image_path)
width, height = image.size
# The point must be in pixels, so the normalized (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The path is passed so that ground_only_positive opens the file itself (its str branch).
# The result is the string 'True' or 'False' ('Error Format' if the model says neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens during generation; a one-token True/False answer is expected.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()  # switch to evaluation mode for inference
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is drawn on and returned.
        point_in_pixel: ``[x, y]`` pixel position to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; skipped when falsy.
        output_path: Accepted but not used by this function.
        color: Outline color of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke around the box.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point; radius (8 * size) and stroke
        # (4 * size) are rounded up to whole pixels.
        x, y = point_in_pixel[0], point_in_pixel[1]
        r = np.ceil(8 * size).astype(int)
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse([x - r, y - r, x + r, y + r], outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right place for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot, the annotated
    image is sent to the model together with a True/False question, and the
    model's short answer is parsed.

    Args:
        model: Loaded generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text of the GUI instruction inserted into the prompt.
        image: Either a path to an image file or an already-opened PIL image.
        point: ``[x, y]`` position of the candidate click, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` if the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller's
        # image must stay clean so several candidate points can be checked
        # against the same screenshot. (The original routed this branch through
        # an undefined image_to_temp_filename helper and raised NameError.)
        image = image.copy()

    _, height = image.size
    # Marker size scales with screenshot height (1.2x at 1000 px tall).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # The prompt wording is reproduced verbatim, typos included, because it is
    # part of the model's expected input.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it has no effect when
    # do_sample=False and only triggers a configuration warning.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
draw = ImageDraw.Draw(img)
# Draw the ground truth bounding box in green
if bbox:
# Assuming bbox format is [x1, y1, x2, y2]
draw.rectangle(bbox, outline="yellow", width=4)
# Draw a small circle around the predicted point in red
if point_in_pixel:
# Create a small rectangle around the point (5 pixels in each direction)
radius = np.ceil(8 * size).astype(int)
circle_bbox = [
point_in_pixel[0] - radius, # x1
point_in_pixel[1] - radius, # y1
point_in_pixel[0] + radius, # x2
point_in_pixel[1] + radius # y2
]
draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
if isinstance(image, str):
image_path = image
image = Image.open(image_path)
else:
image_path = image_to_temp_filename(image)
assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."
width, height = image.size
image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)
prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
full_prompt = prompt_origin.format(instruction)
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": image,
},
{"type": "text", "text": full_prompt},
],
}
]
# Preparation for inference
text_input = processor.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text_input],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda:0")
generated_ids = model.generate(
**inputs,
max_new_tokens=output_len,
do_sample=False,
temperature=0.0
)
generated_ids_trimmed = [
out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
matches = re.findall(r'\b(?:True|False)\b', response)
if not len(matches):
answer = 'Error Format'
else:
answer = matches[-1]
return answer
# Given the screenshot, the instruction and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels: scale the normalised (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated copy and the verification prompt are sent to the model and the
    decoded reply is searched for ``True``/``False``.

    Args:
        model: Generation model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Processor used for the chat template, input encoding and
            decoding.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already loaded PIL image.
        point: ``[x, y]`` pixel position to verify.

    Returns:
        ``'True'`` or ``'False'`` (the last occurrence in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is required.
        image = Image.open(image)
    # draw_annotations paints in place: work on a copy so the caller's image
    # stays clean and can be reused to verify further candidate points.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt is kept verbatim (including its wording quirks): it is model
    # input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; no temperature is passed because it only applies when
    # sampling and 0.0 is not a valid sampling temperature.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image path, an instruction and a coordinate, ask the
# verifier whether the coordinate matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False' or 'Error Format'
# load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode (eval() works in place and returns the module).
model.eval()
def _has_coords(coords):
    """Return True when ``coords`` holds at least one value (None/empty -> False)."""
    try:
        return len(coords) > 0
    except TypeError:
        # None or a scalar: fall back to plain truthiness.
        return bool(coords)


def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned; nothing is written to
    disk.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or ``None``/empty to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or
            ``None``/empty to skip.
        output_path: Unused; accepted for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` px and the
            outline width ``ceil(4 * size)`` px.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)
    # Length-based checks so array-like inputs do not fail on truth-testing.
    if _has_coords(bbox):
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if _has_coords(point_in_pixel):
        # Plain ints keep numpy scalar types out of the Pillow call.
        radius = int(np.ceil(8 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()  # switch to evaluation mode for inference
# Used as max_new_tokens when querying the model.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    Drawing happens in place: ``img`` itself is modified and returned.

    Args:
        img: PIL image to annotate.
        point_in_pixel: ``(x, y)`` pixel position to mark with a hollow
            circle, or a falsy value to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept for backward compatibility. Nothing is
            written to disk.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8 px) and outline
            width (4 px); both are rounded up.

    Returns:
        The same ``img`` object that was passed in.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain ints so PIL never receives NumPy scalar types.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # PIL describes the circle by its enclosing square [x1, y1, x2, y2].
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False. Generation length comes from the
    module-level ``output_len``.

    Args:
        model: Loaded verifier model; must expose ``generate`` and ``device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Supplies ``apply_chat_template``, tensor preparation and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no file on disk. Work on a copy so the
        # caller's image is not permanently marked by the circle drawn below.
        image = image.copy()

    _, height = image.size
    # Marker size scales with image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: this prompt is sent to the model verbatim (including "exame");
    # do not reword it without confirming the prompt the model expects.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it has no effect
    # when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Verify a candidate click position for an instruction on a screenshot.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and
            ``batch_decode``.
        instruction: Text describing the intended target.
        image: A PIL image, or a path to an image file.
        point: ``[x, y]`` pixel coordinate of the candidate position.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded reply),
        or ``'Error Format'`` when the reply contains neither.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # Draw on a copy: repeated calls on the same image must not pile up circles.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # NOTE(review): the wording (including "exame") is kept verbatim because it
    # is presumably the prompt the verifier was trained with -- confirm before
    # editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature only applies to sampling, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated part is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction, and the candidate coordinate
instruction = 'close this window'
image_path = 'test.png'
with Image.open(image_path) as image:
    width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size
point = [int(0.9709 * width), int(0.1548 * height)]
# Pass the path: ground_only_positive opens string inputs itself
answer = ground_only_positive(model, tokenizer, processor, instruction, image_path, point)  # 'True', 'False' or 'Error Format'

# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its tokenizer, and its processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation, then the model is switched to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Passed as max_new_tokens by ground_only_positive: one token per query.
output_len = 1
# NOTE(review): ground_only_positive accepts the tokenizer but does not use it.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` pixel coordinate to mark with a hollow
            circle; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` box outlined in yellow; a falsy value
            skips the box.
        output_path: Unused; kept for backward compatibility (nothing is
            written to disk).
        color: Outline color of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its outline width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object.
    """
    if not bbox and not point_in_pixel:
        # Nothing to draw, so no drawing context is needed.
        return img

    draw = ImageDraw.Draw(img)

    if bbox:
        # Outline the bounding box in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Plain Python ints keep NumPy scalars out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding square of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load the weights in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
# NOTE(review): trust_remote_code=True lets code shipped with the checkpoint run
# locally -- confirm the checkpoint source is trusted.
model = Qwen2VLForConditionalGeneration.from_pretrained(
model_name_or_path,
device_map="cuda:0",
trust_remote_code=True,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens for model.generate.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and the same object is returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixel coordinates; a falsy value skips the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels; a falsy value skips the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        ``img`` with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # Bounding box outline: yellow, 4 px wide.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Hollow circle centred on the point. Radius (8 * size) and stroke
        # (4 * size) are rounded up and converted to plain Python ints.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right location for ``instruction``.

    A hollow red circle is drawn at ``point`` and the marked screenshot is sent
    to the model together with a True/False question.

    Args:
        model: Loaded model; ``model.generate`` is called on it.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, input encoding and decoding.
        instruction: Text describing the intended target.
        image: A file path (``str``) or an already opened PIL image.
        point: ``[x, y]`` in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the response), or
        ``'Error Format'`` when the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines, only to assert that the temp file exists. Work on a copy
        # instead so the caller's image is not permanently marked and repeated
        # calls do not accumulate circles.
        image = image.copy()

    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE: the prompt wording (including "exame") is kept verbatim; it is part
    # of the model input, so editing it would change what the model sees.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; the relative coordinates are scaled by the image size.
# (Fixed: the original had a stray comma, `int(0.1548, * height)`, which raises TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Page label of the next snippet: "load model" / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot (an image
    object supplied by the caller is copied first) and the model is prompted
    to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text inserted into the prompt.
        image: Path to an image file, or an already opened image object.
        point: ``[x, y]`` position in pixels where the circle is drawn.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on a missing or unreadable path, so no separate
        # existence check is needed.
        image = Image.open(image)
    else:
        # Work on a copy so the caller's image is not drawn on; this matters
        # when several candidate points are checked against one screenshot.
        image = image.copy()

    # Scale the marker with the image height (factor 1.2 at a height of 1000).
    marker_scale = image.size[1] / 1000 * 1.2
    image = draw_annotations(image, point, None, output_path=None, size=marker_scale)

    # NOTE(review): the prompt wording is kept verbatim, typos included;
    # presumably it matches what the checkpoint expects - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming a fixed GPU index.
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is not passed because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction and a candidate coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# Output is 'True' or 'False' ('Error Format' if the response contains neither).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model | python | transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens generated per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Put the model in evaluation mode.
model = model.eval()
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` centre of the circle, or a falsy value to skip it.
        bbox: Box to outline (assumed ``[x1, y1, x2, y2]``), or a falsy value to skip it.
        output_path: Accepted for compatibility but unused; nothing is saved here.
        color: Outline colour of the circle.
        size: Scale factor; the radius is ``ceil(8 * size)`` and the line
            width ``ceil(4 * size)``.

    Returns:
        The same ``img`` object that was passed in.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Plain Python ints keep numpy scalar types out of the Pillow calls.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint, its processor, and its tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens by ground_only_positive.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional point marker on ``img``.

    ``img`` is drawn on in place and returned.

    Args:
        img: Image to annotate (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` in pixels to circle; skipped when falsy.
        bbox: ``[x1, y1, x2, y2]`` to outline in yellow; skipped when falsy.
        output_path: Unused; kept for backward compatibility.
        color: Outline color of the circle.
        size: Scale factor for the circle radius and stroke width.

    Returns:
        The annotated ``img``.
    """
    draw = ImageDraw.Draw(img)
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel:
        # Plain ints instead of NumPy scalars for the Pillow arguments.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Hollow circle of radius ceil(8 * size) centered on the point.
        draw.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False about it.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Used for the chat template, the input tensors, and decoding.
        instruction: Description of the intended target.
        image: Path to a screenshot, or an already opened image.
        point: ``[x, y]`` pixel coordinates of the candidate location.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded output),
        or ``'Error Format'`` when the output contains neither.
    """
    if isinstance(image, str):
        # Image.open raises on its own for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined image_to_temp_filename() here, so
        # in-memory images raised NameError. No path is needed; work on a copy
        # so the caller's image is not drawn on.
        image = image.copy()
    _, height = image.size
    # size is 1.2 for a 1000-pixel-tall image and scales linearly with height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    # Prompt wording is kept verbatim: it is the text sent to the model.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given a screenshot, an instruction, and a candidate point, ask the verifier
# whether the point matches the instruction.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale normalized (x, y) coordinates to pixels. The original had a stray comma
# ("int(0.1548, * height)") that made this line fail at runtime.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # 'True', 'False', or 'Error Format'
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` marks the target described by `instruction`.

    A hollow red circle is drawn at `point` on a copy of the screenshot and the
    model is prompted to answer True or False.

    Args:
        model: Loaded model providing `generate` and `device`.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor inputs and decoding.
        instruction: Text describing the intended position in the image.
        image: Path of an image file, or an already opened PIL image.
        point: `[x, y]` of the candidate position, in pixels.

    Returns:
        The last "True"/"False" found in the decoded response, or
        "Error Format" when the response contains neither.
    """
    if isinstance(image, str):
        image = Image.open(image)
    else:
        # An in-memory image needs no temp-file round trip. Work on a copy so
        # the marker is not drawn on the caller's image (repeated calls with
        # different points would otherwise accumulate circles).
        image = image.copy()
    _, height = image.size
    # Marker size grows with the image height.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)
    # Greedy decoding; `temperature` has no effect when do_sample=False, so it is not passed.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: verify a candidate point given an image, an instruction and a coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative (x, y) is scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Switch to inference mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image to draw on; it is modified in place.
        point_in_pixel: `[x, y]` centre of the marker in pixels; falsy to skip.
        bbox: `[x1, y1, x2, y2]` box in pixels; falsy to skip.
        output_path: Unused (nothing is saved); kept for backward compatibility.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius (8) and stroke width (4).

    Returns:
        The same `img` object after drawing.
    """
    draw = ImageDraw.Draw(img)
    # Bounding box outline is yellow with a fixed 4 px stroke.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)
    # Hollow circle centred on the point.
    if point_in_pixel:
        # Built-in ints (not numpy scalars) are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [cx - radius, cy - radius, cx + radius, cy + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Passed as max_new_tokens when ground_only_positive calls model.generate.
output_len = 1

# Load the weights in bfloat16 on cuda:0 with FlashAttention-2, then switch to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional bounding box and an optional point marker on ``img``.

    The image is drawn on in place and the same object is returned.

    Args:
        img: Image accepted by ``ImageDraw.Draw``.
        point_in_pixel: ``(x, y)`` marker centre in pixels; falsy to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels; falsy to skip the box.
        output_path: Not used by this function; nothing is written to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius (8 * size) and stroke (4 * size).

    Returns:
        ``img`` with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow with a fixed 4-pixel stroke.
    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Hollow circle centred on the point; both dimensions are rounded up to whole pixels.
    centre_x, centre_y = point_in_pixel[0], point_in_pixel[1]
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [centre_x - radius, centre_y - radius, centre_x + radius, centre_y + radius],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Generation model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` location in pixels.

    Returns:
        The last ``'True'`` or ``'False'`` found in the decoded reply, or
        ``'Error Format'`` when neither word appears.
    """
    if isinstance(image, str):
        # Image.open raises for a missing or unreadable path, so no extra check is needed.
        image = Image.open(image)
    else:
        # Draw on a copy so the caller's image object is left untouched.
        image = image.copy()

    # The marker scale follows the image height (factor 1.2 at 1000 px).
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the wording (including "exame") is runtime text sent to the
    # model; presumably it matches the training prompt, so it is kept verbatim.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature has no effect when do_sample is False, so it is omitted.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so that only the newly generated ids are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate, verify the point.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so the relative coordinates are scaled by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (section label carried over from the source page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` marks the target of ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot; the
    annotated image plus a True/False question are sent to the model and the
    answer word is parsed from the decoded reply (which is also printed).

    Args:
        model: Loaded verifier model; inputs are moved to ``model.device``.
        tokenizer: Unused; kept so existing call sites keep working.
        processor: Provides ``apply_chat_template`` and ``batch_decode``.
        instruction: Text describing the intended UI element.
        image: Path to a screenshot, or an already opened PIL image.
        point: ``[x, y]`` position of the candidate, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last match in the reply), or
        ``'Error Format'`` when the reply contains neither word.
    """
    # Work on a private copy: draw_annotations paints onto the image it is
    # given, and the caller's screenshot must stay clean for further checks.
    if isinstance(image, (str, os.PathLike)):
        with Image.open(image) as opened:
            image = opened.copy()
    else:
        image = image.copy()

    # The marker is scaled with the screenshot height.
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame") since it
    # is sent to the model as-is -- confirm against the training prompt before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image path, the instruction and the candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# Scale the relative position by the image size. The stray comma in
# `int(0.1548, * height)` star-unpacked `height` and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]  # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load model python transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # used as max_new_tokens by ground_only_positive
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model.eval()  # put the module in evaluation mode
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Annotate ``img`` in place with an optional box and an optional point marker.

    Args:
        img: Image handed to ``ImageDraw.Draw``; it is modified directly.
        point_in_pixel: ``[x, y]`` centre of the hollow circle; falsy to skip it.
        bbox: Rectangle to outline in yellow (assumed ``[x1, y1, x2, y2]``); falsy to skip it.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor for the circle's radius and stroke width.

    Returns:
        The same ``img`` object that was passed in.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # Yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Radius is 8 * size and stroke is 4 * size, both rounded up.
        r = np.ceil(8 * size).astype(int)
        cx, cy = point_in_pixel[0], point_in_pixel[1]
        marker_box = [cx - r, cy - r, cx + r, cy + r]
        canvas.ellipse(marker_box, outline=color, width=np.ceil(4 * size).astype(int))

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Checkpoint shared by the model, the tokenizer and the processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"

# Number of new tokens generated per query (passed as max_new_tokens below).
output_len = 1

# Text/vision preprocessing helpers.
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)

# bf16 weights on the first GPU with FlashAttention-2 attention.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Switch to inference mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the marker in pixels, or ``None``
            (or empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None`` (or empty) to
            skip the box.
        output_path: Unused; nothing is written to disk. Kept so existing
            callers that pass it keep working.
        color: Outline colour of the point marker.
        size: Scale factor applied to the marker radius and stroke width.

    Returns:
        The same ``img`` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks so array-like inputs do not raise on truthiness.
    if bbox is not None and len(bbox) > 0:
        # Bounding box is outlined in yellow with a fixed 4 px stroke.
        draw.rectangle(tuple(bbox), outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle of radius ceil(8 * size); plain ints are handed to Pillow.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Not used by this function; kept for caller compatibility.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` found in the response, or
        ``'Error Format'`` if the response contains neither.

    Raises:
        FileNotFoundError: If ``image`` is a path that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening so a bad path fails with a clear message.
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image}")
        image = Image.open(image)
    else:
        # The original called image_to_temp_filename(), which this snippet never
        # defines; an in-memory image needs no file. Copy it so the caller's
        # image is not drawn on.
        image = image.copy()

    # Marker size scales with the image height (factor 1.2 at 1000 px).
    image = draw_annotations(image, point, None, output_path=None, size=image.height / 1000 * 1.2)

    # NOTE(review): prompt wording is kept verbatim (including "exame");
    # presumably it matches the prompt used in training - confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is dropped because it only affects sampling.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given an image, an instruction and a candidate coordinate, query the verifier.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative (x, y) by the image size.
# (The original `int(0.1548, * height)` had a stray comma and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- next snippet: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right spot for ``instruction``.

    A hollow red circle is drawn at ``point`` on the screenshot and the model
    is prompted to answer True or False about whether the circle sits on the
    position described by ``instruction``.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to an image file, or an already opened PIL image.
        point: ``(x, y)`` position to verify, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        # Validate before opening, with a real exception rather than an
        # ``assert`` (asserts are stripped under ``python -O``).
        if not os.path.isfile(image):
            raise FileNotFoundError("Invalid input image path.")
        image = Image.open(image)
    else:
        # An in-memory image needs no on-disk path (the path was only ever
        # used for validation). Work on a copy so the caller's image is not
        # left with a circle drawn on it.
        image = image.copy()

    _, height = image.size
    # The marker scales with the image height: size is 1.2 for a 1000 px tall image.
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # NOTE(review): the prompt is reproduced verbatim, including the "exame"
    # typo; presumably it matches the wording the model was trained with --
    # confirm before changing it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; ``temperature`` is omitted because it only applies
    # when sampling is enabled.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is the answer.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given the image path, an instruction and a coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Identifier passed to every from_pretrained call below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Weights are loaded in bfloat16 onto cuda:0 with the flash_attention_2
# attention implementation requested; .eval() switches to evaluation mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).eval()
# Used as max_new_tokens by ground_only_positive: one generated token per query.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional hollow circle onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels, or
            ``None``/empty to skip the circle.
        bbox: ``[x1, y1, x2, y2]`` box in pixels, or ``None``/empty to skip
            the rectangle.
        output_path: Unused; kept so existing callers keep working. Nothing
            is written to disk.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` pixels
            and its outline is ``ceil(4 * size)`` pixels wide.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks instead of bare truthiness, so an
    # array-like argument does not raise on an ambiguous truth value.
    if bbox is not None and len(bbox) > 0:
        # Bounding box in [x1, y1, x2, y2] format, outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints so Pillow receives plain Python numbers rather than
        # numpy scalars.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # An ellipse is described by its bounding box; a square box centred
        # on the point yields a circle.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Upper bound on newly generated tokens per query (read by ground_only_positive).
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# eval() hands back the same module, now in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Image to draw on (passed to ``ImageDraw.Draw``).
        point_in_pixel: ``(x, y)`` marker centre in pixels; skipped when falsy.
        bbox: Box as ``[x1, y1, x2, y2]``; skipped when falsy.
        output_path: Unused; this function writes nothing to disk.
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and stroke width.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box is outlined in yellow with a fixed 4-pixel stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Radius (8 * size) and stroke (4 * size) are rounded up to whole pixels.
    center_x, center_y = point_in_pixel[0], point_in_pixel[1]
    reach = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    canvas.ellipse(
        [center_x - reach, center_y - reach, center_x + reach, center_y + reach],
        outline=color,
        width=stroke,
    )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and the
    model is prompted to answer True or False about its placement.

    Args:
        model: Loaded generation model; inputs are moved to its device.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Provides ``apply_chat_template``, input tensor preparation
            and ``batch_decode``.
        instruction: Text describing the intended target.
        image: An image object, or a path string opened with ``Image.open``.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the decoded response),
        or ``'Error Format'`` when the response contains neither.
    """
    # Fix: non-path inputs used to go through `image_to_temp_filename`, which is not
    # defined in this snippet (NameError); the path was only asserted on, never used.
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)

    # Draw on a copy so repeated calls do not pile circles onto the caller's image.
    height = image.size[1]
    image = draw_annotations(image.copy(), point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE(review): the prompt is sent verbatim, wording quirks included; presumably
    # it matches the prompt the checkpoint was trained with -- confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own device; fall back to the previously hard-coded one.
    inputs = inputs.to(getattr(model, "device", "cuda:0"))

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Drop the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example usage: given an image, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is the right target.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so scale the normalized (x, y) by the image size.
# Fix: the original `int(0.1548, * height)` parsed as `int(0.1548, *height)` and
# raised TypeError at runtime, because an int cannot be star-unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns the string 'True' or 'False' ('Error Format' if neither word is produced).
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# --- next example: load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer whether the circle is correctly placed.

    Args:
        model: Loaded vision-language model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Natural-language description of the intended target.
        image: Path to a screenshot file, or an already opened PIL image.
        point: ``(x, y)`` candidate location in pixels.

    Returns:
        ``'True'`` or ``'False'`` as found in the model response, or
        ``'Error Format'`` when the response contains neither word.

    Raises:
        FileNotFoundError: If ``image`` is a string that is not an existing file.
    """
    if isinstance(image, str):
        if not os.path.isfile(image):
            raise FileNotFoundError(f"Invalid input image path: {image!r}")
        image = Image.open(image)
    else:
        # Work on a copy: draw_annotations paints in place, and the caller may
        # reuse the same screenshot to verify several candidate points.
        image = image.copy()

    _, height = image.size
    # Scale the marker with the screenshot height.
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # The prompt wording is reproduced verbatim on purpose; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; a temperature is ignored when do_sample=False, so none
    # is passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    # The last standalone True/False in the response is taken as the verdict.
    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a candidate coordinate.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the normalized (x, y) by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
# The result is 'True', 'False', or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# Snippet caption from the source page: "load model" (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Switch to inference mode.
model = model.eval()
# Number of new tokens generated per query (used as max_new_tokens).
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` position in pixels to mark with a hollow
            circle, or ``None``/empty to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` box in pixels to outline in yellow, or
            ``None``/empty to skip the box.
        output_path: Unused; kept so existing callers keep working.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels and
            its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(img)

    # Explicit None/length checks: the truthiness of a NumPy array raises
    # ValueError, and coordinates are often passed as arrays.
    if bbox is not None and len(bbox) > 0:
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Built-in ints keep NumPy scalars out of the Pillow call.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# load model
# Checkpoint identifier shared by the model, tokenizer and processor loads below.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Load in bfloat16 on cuda:0 with the flash_attention_2 attention
# implementation, then switch the model to eval mode.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2"
).eval()
# Read by ground_only_positive as max_new_tokens, so one token is generated.
output_len = 1
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``[x, y]`` in pixels to mark with a hollow circle, or
            a falsy value (``None``/empty) to skip the marker.
        bbox: ``[x1, y1, x2, y2]`` in pixels to outline in yellow, or a falsy
            value to skip the box.
        output_path: Unused; kept so existing callers keep working (nothing
            is written to disk here).
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` pixels
            and its stroke width is ``ceil(4 * size)`` pixels.

    Returns:
        The same ``img`` object, now annotated.
    """
    draw = ImageDraw.Draw(img)

    if bbox:
        # bbox is assumed to be [x1, y1, x2, y2]; the outline is yellow.
        draw.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        # Convert to plain Python ints so no NumPy scalars are handed to PIL.
        radius = int(np.ceil(8 * size))
        stroke = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        # Bounding box of the circle centred on the point.
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the model whether ``point`` correctly locates ``instruction`` in ``image``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False about its placement.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Object providing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        instruction: Text inserted into the prompt as the intended target.
        image: Screenshot, either a file path (``str``) or a PIL image.
        point: ``[x, y]`` in pixels of the location to verify.

    Returns:
        The last ``'True'``/``'False'`` found in the decoded response, or
        ``'Error Format'`` if neither word appears.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing/unreadable path.
        image = Image.open(image)
    # Annotate a copy: draw_annotations draws in place, and the caller's
    # screenshot must stay clean so other points can be checked on it later.
    # An already-opened image is used directly; no temp file is needed.
    annotated = image.copy()
    _, height = annotated.size
    annotated = draw_annotations(
        annotated, point, None, output_path=None, size=height / 1000 * 1.2
    )

    # The prompt wording (including "exame") is kept verbatim.
    # NOTE(review): presumably it must match the prompt the checkpoint was
    # tuned with -- confirm before rewording.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # output_len is the module-level generation length.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0,
    )
    # Drop the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Given the image, the instruction and the coordinate to verify
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels: scale the relative coordinates by the image size.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False

# Code example: load model (python, transformers)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier model whether ``point`` matches ``instruction``.

    A circle is drawn at ``point`` on a copy of the screenshot, the annotated
    image plus a fixed prompt are sent to ``model``, and at most
    ``output_len`` newly generated tokens are parsed for ``True``/``False``.

    Args:
        model: Loaded vision-language model exposing ``generate``.
        tokenizer: Unused; kept for interface compatibility.
        processor: Processor used to build the prompt and decode the output.
        instruction: Natural-language description of the intended target.
        image: Screenshot as a file path (``str``) or an already opened image.
        point: ``[x, y]`` position of the candidate location, in pixels.

    Returns:
        ``'True'`` or ``'False'`` (as strings), or ``'Error Format'`` when the
        decoded response contains neither word.
    """
    if isinstance(image, str):
        # Image.open raises by itself when the path is missing or unreadable.
        image = Image.open(image)
    # In-memory images used to go through image_to_temp_filename(), which is
    # not defined in this example (NameError). No file on disk is needed
    # because the image object itself is handed to the processor below.
    _, height = image.size
    # Draw on a copy so the caller's image is not modified.
    annotated = draw_annotations(
        image.copy(), point, None, output_path=None, size=height / 1000 * 1.2
    )
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": annotated},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
        temperature=0.0
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)
    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last match; 'Error Format' flags a reply without True/False.
    return matches[-1] if matches else 'Error Format'
# Example: given the image, the instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels; here it is derived from relative coordinates.
# Fixed: the stray comma in `int(0.1548, * height)` raised a TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# --- load model (python, transformers) ---
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Used as max_new_tokens when querying the model.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="cuda:0",
)
# Put the model in evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on ``img``.

    Args:
        img: Image to draw on; it is passed to ``ImageDraw.Draw``.
        point_in_pixel: ``[x, y]`` pixel position to circle; ``None`` or empty
            skips the marker.
        bbox: Box handed to ``draw.rectangle`` (assumed ``[x1, y1, x2, y2]``);
            ``None`` or empty skips the box.
        output_path: Unused; nothing is written to disk. Kept for
            compatibility with existing callers.
        color: Outline colour of the point marker.
        size: Scale factor; the marker radius is ``ceil(8 * size)`` and its
            outline width ``ceil(4 * size)`` pixels.

    Returns:
        The ``img`` object that was drawn on.
    """
    draw = ImageDraw.Draw(img)
    # Explicit None/length checks instead of truthiness, so numpy arrays
    # (whose truth value is ambiguous) are accepted as well as lists.
    if bbox is not None and len(bbox) > 0:
        # Bounding box outlined in yellow.
        draw.rectangle(bbox, outline="yellow", width=4)
    if point_in_pixel is not None and len(point_in_pixel) > 0:
        # Hollow circle centred on the point; plain ints for the drawing call.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        circle_bbox = [x - radius, y - radius, x + radius, y + radius]
        draw.ellipse(circle_bbox, outline=color, width=line_width)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Number of new tokens to generate; read by ground_only_positive as max_new_tokens.
output_len = 1
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
# Inference only: switch the module to evaluation mode.
model = model.eval()
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional box and an optional hollow point marker onto ``img``.

    The image is modified in place and also returned.

    Args:
        img: Pillow image to draw on.
        point_in_pixel: ``(x, y)`` pixel position to circle; falsy to skip.
        bbox: ``[x1, y1, x2, y2]`` box to outline in yellow; falsy to skip.
        output_path: Unused; nothing is written to disk here.
        color: Outline color of the point marker.
        size: Marker scale; radius is ``ceil(8 * size)`` and stroke width is
            ``ceil(4 * size)``.

    Returns:
        The same ``img`` object after drawing.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        # The box always gets a yellow outline with a fixed 4 px stroke.
        canvas.rectangle(bbox, outline="yellow", width=4)

    if not point_in_pixel:
        return img

    # Bounding square of the circle, centered on the point.
    radius = np.ceil(8 * size).astype(int)
    stroke = np.ceil(4 * size).astype(int)
    x, y = point_in_pixel[0], point_in_pixel[1]
    canvas.ellipse([x - radius, y - radius, x + radius, y + radius], outline=color, width=stroke)
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is prompted to answer True or False.

    Args:
        model: Model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor used for the chat template, tensor preparation
            and decoding.
        instruction: Text describing the intended target.
        image: Screenshot, either a file path or an opened Pillow image.
        point: ``(x, y)`` candidate position in pixels.

    Returns:
        The last ``'True'``/``'False'`` word found in the model reply, or
        ``'Error Format'`` if the reply contains neither.

    Raises:
        Whatever ``Image.open`` raises when ``image`` is an unreadable path.
    """
    if isinstance(image, str):
        # Image.open already fails on a bad path, so the old assert that ran
        # after it could never fire and has been removed.
        image = Image.open(image)
    # Fixed: Pillow inputs used to go through image_to_temp_filename, which is
    # not defined in this snippet (NameError). The image is passed to the model
    # directly, so no temporary file is needed.

    # Fixed: draw on a copy so the caller's image does not accumulate circles
    # when several candidate points are checked on the same screenshot.
    image = image.copy()
    height = image.size[1]
    image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

    # NOTE: the prompt wording is runtime text sent to the model and is kept
    # verbatim, including its typos.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of hard-coding "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding. temperature=0.0 was dropped: it has no effect when
    # do_sample=False and can trip generation-config validation.
    generated_ids = model.generate(**inputs, max_new_tokens=output_len, do_sample=False)

    # Strip the prompt tokens so only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    return matches[-1] if matches else 'Error Format'
# Example: given a screenshot, an instruction and a candidate coordinate,
# ask the verifier whether the coordinate is correct.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point should be in pixels, so the normalized (x, y) is scaled by the image size.
# Fixed: a stray comma made this `int(0.1548, *height)`, which raises TypeError
# at runtime because an int cannot be unpacked.
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether `point` is the correct target for `instruction`.

    A hollow red circle is drawn at `point` on the screenshot and the model is
    prompted to answer True or False.

    Args:
        model: Loaded verifier model; must expose `generate()` and `device`.
        tokenizer: Unused here; kept so existing callers keep working.
        processor: Processor used for the chat template, tensors and decoding.
        instruction: Text describing the intended target.
        image: Path to the screenshot, or an already opened PIL image.
        point: `[x, y]` position to verify, in pixels.

    Returns:
        'True' or 'False' (strings), or 'Error Format' when the decoded
        response contains neither word.
    """
    if isinstance(image, str):
        # Image.open itself raises for a missing or unreadable path.
        image = Image.open(image)
    else:
        # The original called an undefined `image_to_temp_filename` here, so
        # every PIL input failed with NameError. No file is needed because the
        # image is handed to the processor in memory. Work on a copy so the
        # marker is not drawn onto the caller's image.
        image = image.copy()

    # Scale the marker with the screenshot height.
    image = draw_annotations(
        image, point, None, output_path=None, size=image.height / 1000 * 1.2
    )

    # Kept verbatim, wording included -- presumably the prompt the model was
    # trained with; TODO confirm before editing.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of a hard-coded "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; `temperature` is ignored when do_sample=False, so it is
    # not passed.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Drop the prompt tokens so that only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    # Use the last verdict found in the response.
    return matches[-1] if matches else 'Error Format'
# Example: given an image, an instruction and a coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels. The original `int(0.1548, * height)` had a stray
# comma, which tried to unpack `height` as extra arguments and raised TypeError.
point = [int(0.9709 * width), int(0.1548 * height)]
# Returns 'True', 'False' or 'Error Format'.
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)
# load model / python / transformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its processor and tokenizer.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
# Passed as max_new_tokens by ground_only_positive: one generated token.
output_len = 1
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    device_map="cuda:0",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
# Inference only: switch to evaluation mode.
model = model.eval()
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Draw an optional bounding box and an optional point marker on `img`.

    Args:
        img: PIL image; it is drawn on in place.
        point_in_pixel: `[x, y]` in pixels, or a falsy value to skip the marker.
        bbox: `[x1, y1, x2, y2]` in pixels, or a falsy value to skip the box.
        output_path: Unused; kept for backward compatibility (nothing is saved).
        color: Outline colour of the point marker.
        size: Scale factor for the marker radius and line width.

    Returns:
        The same `img` object, annotated.
    """
    draw = ImageDraw.Draw(img)

    # Bounding box, outlined in yellow.
    if bbox:
        draw.rectangle(bbox, outline="yellow", width=4)

    # Hollow circle centred on the point.
    if point_in_pixel:
        # Convert to built-in ints so no NumPy scalars are handed to Pillow.
        radius = int(np.ceil(8 * size))
        line_width = int(np.ceil(4 * size))
        x, y = point_in_pixel[0], point_in_pixel[1]
        draw.ellipse(
            [x - radius, y - radius, x + radius, y + radius],
            outline=color,
            width=line_width,
        )
    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the response), or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises by itself for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # The original called an undefined image_to_temp_filename() for PIL inputs
    # (NameError); nothing downstream needs a file path, so it is dropped.
    # Work on a copy so the caller's image is not drawn on.
    image = image.copy()
    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # Prompt text is kept verbatim (including its wording) -- presumably it
    # matches what the model was trained with; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is irrelevant when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Given the image path, the instruction and the coordinate to verify.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
# (The original `int(0.1548, * height)` unpacked `height` as arguments and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (heading of the next code example on the page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # handed to generate() as max_new_tokens
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()  # eval() hands back the same module, now in inference mode
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional box and an optional hollow circle on ``img``.

    The drawing happens in place on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels; skipped
            when falsy.
        bbox: Rectangle assumed to be ``[x1, y1, x2, y2]``; outlined in
            yellow with a 4-pixel stroke, skipped when falsy.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` and its
            stroke width is ``ceil(4 * size)``.

    Returns:
        The same ``img`` object with the annotations applied.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        reach = np.ceil(8 * size).astype(int)
        # Bounding square of the circle: [left, top, right, bottom].
        bounds = [
            point_in_pixel[0] - reach,
            point_in_pixel[1] - reach,
            point_in_pixel[0] + reach,
            point_in_pixel[1] + reach,
        ]
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse(bounds, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the response), or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises by itself for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # The original called an undefined image_to_temp_filename() for PIL inputs
    # (NameError); nothing downstream needs a file path, so it is dropped.
    # Work on a copy so the caller's image is not drawn on.
    image = image.copy()
    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # Prompt text is kept verbatim (including its wording) -- presumably it
    # matches what the model was trained with; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is irrelevant when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Given the image path, the instruction and the coordinate to verify.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
# (The original `int(0.1548, * height)` unpacked `height` as arguments and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (heading of the next code example on the page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # handed to generate() as max_new_tokens
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()  # eval() hands back the same module, now in inference mode
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional box and an optional hollow circle on ``img``.

    The drawing happens in place on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels; skipped
            when falsy.
        bbox: Rectangle assumed to be ``[x1, y1, x2, y2]``; outlined in
            yellow with a 4-pixel stroke, skipped when falsy.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` and its
            stroke width is ``ceil(4 * size)``.

    Returns:
        The same ``img`` object with the annotations applied.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        reach = np.ceil(8 * size).astype(int)
        # Bounding square of the circle: [left, top, right, bottom].
        bounds = [
            point_in_pixel[0] - reach,
            point_in_pixel[1] - reach,
            point_in_pixel[0] + reach,
            point_in_pixel[1] + reach,
        ]
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse(bounds, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the response), or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises by itself for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # The original called an undefined image_to_temp_filename() for PIL inputs
    # (NameError); nothing downstream needs a file path, so it is dropped.
    # Work on a copy so the caller's image is not drawn on.
    image = image.copy()
    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # Prompt text is kept verbatim (including its wording) -- presumably it
    # matches what the model was trained with; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is irrelevant when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Given the image path, the instruction and the coordinate to verify.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
# (The original `int(0.1548, * height)` unpacked `height` as arguments and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (heading of the next code example on the page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # handed to generate() as max_new_tokens
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()  # eval() hands back the same module, now in inference mode
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional box and an optional hollow circle on ``img``.

    The drawing happens in place on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels; skipped
            when falsy.
        bbox: Rectangle assumed to be ``[x1, y1, x2, y2]``; outlined in
            yellow with a 4-pixel stroke, skipped when falsy.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` and its
            stroke width is ``ceil(4 * size)``.

    Returns:
        The same ``img`` object with the annotations applied.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        reach = np.ceil(8 * size).astype(int)
        # Bounding square of the circle: [left, top, right, bottom].
        bounds = [
            point_in_pixel[0] - reach,
            point_in_pixel[1] - reach,
            point_in_pixel[0] + reach,
            point_in_pixel[1] + reach,
        ]
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse(bounds, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the response), or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises by itself for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # The original called an undefined image_to_temp_filename() for PIL inputs
    # (NameError); nothing downstream needs a file path, so it is dropped.
    # Work on a copy so the caller's image is not drawn on.
    image = image.copy()
    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # Prompt text is kept verbatim (including its wording) -- presumably it
    # matches what the model was trained with; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is irrelevant when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Given the image path, the instruction and the coordinate to verify.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
# (The original `int(0.1548, * height)` unpacked `height` as arguments and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# load model | python | transformers  (heading of the next code example on the page)
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info
# Load the verifier checkpoint together with its tokenizer and processor.
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
output_len = 1  # handed to generate() as max_new_tokens
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda:0",
    trust_remote_code=True,
)
model = model.eval()  # eval() hands back the same module, now in inference mode
processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    """Overlay an optional box and an optional hollow circle on ``img``.

    The drawing happens in place on ``img``, which is also returned.

    Args:
        img: PIL image to draw on.
        point_in_pixel: ``(x, y)`` centre of the circle in pixels; skipped
            when falsy.
        bbox: Rectangle assumed to be ``[x1, y1, x2, y2]``; outlined in
            yellow with a 4-pixel stroke, skipped when falsy.
        output_path: Not used by this function.
        color: Outline colour of the circle.
        size: Scale factor; the circle radius is ``ceil(8 * size)`` and its
            stroke width is ``ceil(4 * size)``.

    Returns:
        The same ``img`` object with the annotations applied.
    """
    canvas = ImageDraw.Draw(img)

    if bbox:
        canvas.rectangle(bbox, outline="yellow", width=4)

    if point_in_pixel:
        reach = np.ceil(8 * size).astype(int)
        # Bounding square of the circle: [left, top, right, bottom].
        bounds = [
            point_in_pixel[0] - reach,
            point_in_pixel[1] - reach,
            point_in_pixel[0] + reach,
            point_in_pixel[1] + reach,
        ]
        stroke = np.ceil(4 * size).astype(int)
        canvas.ellipse(bounds, outline=color, width=stroke)

    return img
def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    """Ask the verifier whether ``point`` is the right target for ``instruction``.

    A hollow red circle is drawn at ``point`` on a copy of the screenshot and
    the model is asked to answer True or False about its placement.

    Args:
        model: Loaded model exposing ``generate`` and ``device``.
        tokenizer: Unused; kept so existing callers keep working.
        processor: Processor providing ``apply_chat_template``, ``__call__``
            and ``batch_decode``.
        instruction: Text inserted into the verification prompt.
        image: Path to an image file, or an already opened PIL image.
        point: ``[x, y]`` position in pixels.

    Returns:
        ``'True'`` or ``'False'`` (the last such word in the response), or
        ``'Error Format'`` when the response contains neither.
    """
    if isinstance(image, str):
        # Image.open raises by itself for a missing or unreadable path, so no
        # separate existence check is needed.
        image = Image.open(image)
    # The original called an undefined image_to_temp_filename() for PIL inputs
    # (NameError); nothing downstream needs a file path, so it is dropped.
    # Work on a copy so the caller's image is not drawn on.
    image = image.copy()
    _, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    # Prompt text is kept verbatim (including its wording) -- presumably it
    # matches what the model was trained with; do not "fix" it.
    prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": full_prompt},
            ],
        }
    ]

    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's device instead of assuming "cuda:0".
    inputs = inputs.to(model.device)

    # Greedy decoding; temperature is irrelevant when do_sample=False.
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,
    )
    # Strip the prompt tokens so only the newly generated ones are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print(response)

    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        return 'Error Format'
    return matches[-1]
# Given the image path, the instruction and the coordinate to verify.
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
# The point must be in pixels, so scale the relative coordinates by the image size.
# (The original `int(0.1548, * height)` unpacked `height` as arguments and raised TypeError.)
point = [int(0.9709 * width), int(0.1548 * height)]
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # output True or False
# Deploy This Model  (page section heading that follows the code examples)
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.