GUI-Actor-Verifier-2B

by microsoft · Image model · 2B params · License: MIT · 98 downloads

Edge AI readiness: early-stage; suitable for mobile, laptop, or server deployment with 5GB+ RAM.
Quick Summary

GUI-Actor-Verifier-2B is a 2B-parameter, Qwen2-VL-based verifier for GUI grounding. Given a screenshot with a candidate click point marked by a hollow red circle, plus a natural-language instruction, it answers True or False to indicate whether the marked point lands on the intended UI element. It is intended for scoring and filtering click predictions from GUI agents such as GUI-Actor.

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 2GB+ RAM
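
For reference, 2B parameters stored in bfloat16 (2 bytes per parameter) occupy roughly 4GB for the weights alone, which lines up with the 5GB+ RAM figure above once activations and runtime overhead are added.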

Code Examples
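
The example below loads the verifier, draws a hollow red circle at a candidate click point on a screenshot, and asks the model a single True/False question: does the circle sit on the element the instruction refers to? It assumes a CUDA GPU, the flash-attn package, and the qwen_vl_utils helper from the Qwen2-VL ecosystem.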

Load the model and verify a click point (Python, transformers):
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
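# Note: attn_implementation="flash_attention_2" requires the flash-attn package;
# if it is not installed, attn_implementation="sdpa" is a reasonable fallback
# (an assumption; the original card only shows the flash-attn path).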
output_len = 1  # the verifier only needs a single token for its True/False answer

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)

    # Draw the ground-truth bounding box, if given, in yellow
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)

    # Draw a hollow circle around the predicted point
    if point_in_pixel:
        # Bounding box of the circle: `radius` pixels in each direction from the point
        radius = int(np.ceil(8 * size))
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=int(np.ceil(4 * size)))

    return img
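
# NOTE: `ground_only_positive` below calls `image_to_temp_filename`, which the
# original snippet never defines. This is a minimal implementation consistent
# with how the returned path is used (an assumption, not from the original card).
import tempfile

def image_to_temp_filename(image):
    # Save a PIL image to a temporary PNG and return the file path.
    fd, path = tempfile.mkstemp(suffix=".png")
    os.close(fd)
    image.save(path)
    return path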

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
    if isinstance(image, str):
        image_path = image
        image = Image.open(image_path)
    else:
        image_path = image_to_temp_filename(image)
    assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

    width, height = image.size
    image = draw_annotations(image, point, None, output_path=None, size=height / 1000 * 1.2)

    prompt_origin = "Please observe the screenshot and examine whether the hollow red circle is accurately placed on the intended position in the image: '{}'. Answer True or False."
    full_prompt = prompt_origin.format(instruction)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": full_prompt},
            ],
        }
    ]
    # Preparation for inference
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda:0")

    generated_ids = model.generate(
        **inputs,
        max_new_tokens=output_len,
        do_sample=False,  # greedy decoding, so no sampling temperature is needed
    )

    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    response = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

    print(response)
    # Keep the last True/False in the response; anything else is a format error.
    matches = re.findall(r'\b(?:True|False)\b', response)
    if not matches:
        answer = 'Error Format'
    else:
        answer = matches[-1]
    return answer

# Given an image, an instruction, and a candidate coordinate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # the point must be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # returns 'True' or 'False'
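
In practice a verifier like this is typically used to filter or rank candidate click points. The sketch below is hypothetical usage (not from the original card): it checks a few normalized candidates and keeps the first one the model accepts. Note that draw_annotations draws the circle on the image in place, so the screenshot is reopened for each candidate.

candidates = [(0.9709, 0.1548), (0.5, 0.5)]  # normalized (x, y) guesses, e.g. from a grounding model
verified = None
for nx, ny in candidates:
    img = Image.open('test.png')  # reopen each time: draw_annotations mutates the image
    w, h = img.size
    candidate_point = [int(nx * w), int(ny * h)]
    if ground_only_positive(model, tokenizer, processor, instruction, img, candidate_point) == 'True':
        verified = candidate_point
        break
print('verified point:', verified)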

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548 * height)]  # the point must be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point)  # returns 'True', 'False', or 'Error Format'
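
In practice the verifier is paired with a grounding model: the grounder proposes candidate click points and the verifier filters them. Below is a minimal sketch of that loop, assuming candidate_points is a list of [x, y] pixel proposals produced by some external grounder (it is not defined in this model card):

# Hypothetical helper: iterate over proposed points and return the first one
# the verifier accepts. Relies only on ground_only_positive() defined above.
def pick_verified_point(model, tokenizer, processor, instruction, image, candidate_points):
    for pt in candidate_points:
        # Pass a copy: draw_annotations draws the circle on the image in place,
        # so reusing one PIL object would accumulate circles across candidates.
        verdict = ground_only_positive(model, tokenizer, processor, instruction, image.copy(), pt)
        if verdict == 'True':
            return pt
    return None  # no candidate passed verification

Note that image.copy() matters here; without it, each iteration would inherit the circles drawn for earlier candidates and confuse the verifier.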

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
load modelpythontransformers
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from transformers.generation import GenerationConfig
import json
import re
import os
import numpy as np
from PIL import Image, ImageDraw
from qwen_vl_utils import process_vision_info



# load model
model_name_or_path = "microsoft/GUI-Actor-Verifier-2B"
model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_name_or_path, 
            device_map="cuda:0", 
            trust_remote_code=True, 
            torch_dtype=torch.bfloat16,
            attn_implementation="flash_attention_2"
        ).eval()
output_len = 1

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_name_or_path)

def draw_annotations(img, point_in_pixel, bbox, output_path='test.png', color='red', size=1):
    draw = ImageDraw.Draw(img)
    
    # Draw the ground truth bounding box in green
    if bbox:
        # Assuming bbox format is [x1, y1, x2, y2]
        draw.rectangle(bbox, outline="yellow", width=4)
    
    # Draw a small circle around the predicted point in red
    if point_in_pixel:
        # Create a small rectangle around the point (5 pixels in each direction)
        radius = np.ceil(8 * size).astype(int)
        circle_bbox = [
            point_in_pixel[0] - radius,  # x1
            point_in_pixel[1] - radius,  # y1
            point_in_pixel[0] + radius,  # x2
            point_in_pixel[1] + radius   # y2
        ]
        draw.ellipse(circle_bbox, outline=color, width=np.ceil(4 * size).astype(int))
    
    return img

def ground_only_positive(model, tokenizer, processor, instruction, image, point):
  if isinstance(image, str):
      image_path = image
      image = Image.open(image_path)
  else:
      image_path = image_to_temp_filename(image)
  assert os.path.exists(image_path) and os.path.isfile(image_path), "Invalid input image path."

  width, height = image.size
  image = draw_annotations(image, point, None, output_path=None, size=height/1000 * 1.2)

  prompt_origin = "Please observe the screenshot and exame whether the hollow red circle accurately placed on the intended position in the image: '{}'. Answer True or False."
  full_prompt = prompt_origin.format(instruction)

  messages = [
      {
          "role": "user",
          "content": [
              {
                  "type": "image",
                  "image": image,
              },
              {"type": "text", "text": full_prompt},
          ],
      }
  ]
  # Preparation for inference
  text_input = processor.apply_chat_template(
      messages, tokenize=False, add_generation_prompt=True
  )
  image_inputs, video_inputs = process_vision_info(messages)
  inputs = processor(
      text=[text_input],
      images=image_inputs,
      videos=video_inputs,
      padding=True,
      return_tensors="pt",
  )
  inputs = inputs.to("cuda:0")

  generated_ids = model.generate(
      **inputs,  
      max_new_tokens=output_len,
      do_sample=False,
      temperature=0.0
  )

  generated_ids_trimmed = [
      out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
  ]
  response = processor.batch_decode(
      generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
  )[0]

  print(response)
  matches = re.findall(r'\b(?:True|False)\b', response)
  if not len(matches):
      answer = 'Error Format'
  else:
      answer = matches[-1]
  return answer

# given the image path and instruction and coorindate
instruction = 'close this window'
image = Image.open('test.png')
width, height = image.size
point = [int(0.9709 * width), int(0.1548, * height)] # The point should be in pixels
answer = ground_only_positive(model, tokenizer, processor, instruction, image, point) # output True or False
