kosmos-2.5-ft

21
2
1 language
license:apache-2.0
by
merve
Image Model
OTHER
2.5B params
New
21 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
6GB+ RAM
Mobile
Laptop
Server
Quick Summary

Kosmos-2.5 fine-tuned for grounded OCR (OCR with bounding boxes); the fine-tuning script is available on GitHub (GH) and Hugging Face (HF).

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum recommended: 3GB+ RAM

Code Examples

Python (transformers)
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

# Load the fine-tuned OCR checkpoint on GPU in bfloat16; the processor comes
# from the base microsoft/kosmos-2.5 repo (the fine-tune reuses its tokenizer
# and image preprocessing).
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
# Sample document image; streamed so PIL can read straight from the response body.
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
# "<ocr>" is the task prompt that asks Kosmos-2.5 for grounded OCR output
# (text lines interleaved with <bbox> coordinate tags).
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
# The processor returns the height/width it resized the page to; pop them out
# (model.generate does not accept them) and keep them to rescale predicted
# boxes back to the original image resolution.
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

# Move every tensor to the GPU; some processor outputs may be None and are
# passed through untouched.
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
# Patches must match the model dtype (bfloat16) or generate() will fail on dtype mismatch.
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

# Decoded text still contains the <bbox><x_..><y_..>... markup, which
# post_process() parses below.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):
    """Turn raw Kosmos-2.5 OCR markup into one quadrilateral row per text line.

    The model emits ``<bbox><x_A><y_B><x_C><y_D></bbox>TEXT`` fragments. Each
    valid box is rescaled from processor coordinates back to the original
    image, and formatted as ``x0,y0,x1,y0,x1,y1,x0,y1,label`` (clockwise quad
    corners followed by the recognized text).

    Parameters
    ----------
    y : str
        Decoded generation output containing the bbox markup.
    scale_height, scale_width : float
        Ratios of original image size to processor-resized size.

    Returns
    -------
    str
        Newline-joined rows; empty string when no valid box is found.
    """
    bbox_re = re.compile(r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>")
    digit_re = re.compile(r"\d+")

    tags = bbox_re.findall(y)
    # Text between consecutive bbox tags; drop the leading segment (prompt /
    # preamble before the first box).
    texts = bbox_re.split(y)[1:]

    rows = []
    for idx, tag in enumerate(tags):
        coords = [int(tok) for tok in digit_re.findall(tag)]
        if len(coords) != 4:
            continue
        left, top, right, bottom = coords

        # Skip degenerate or inverted boxes.
        if left >= right or top >= bottom:
            continue

        px0 = int(left * scale_width)
        py0 = int(top * scale_height)
        px1 = int(right * scale_width)
        py1 = int(bottom * scale_height)

        text = texts[idx] if idx < len(texts) else ""
        # The model often prefixes lines with ", "; drop it, then trim whitespace.
        text = text.lstrip(", ").strip()

        quad = (px0, py0, px1, py0, px1, py1, px0, py1)
        rows.append(",".join(map(str, quad)) + "," + text)

    return "\n".join(rows)


# Convert the first (and only) decoded sequence into quad-per-line format.
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

# Each row is "x0,y0,x1,y0,x1,y1,x0,y1,label"; the first 8 integers are the
# quad corners (the label comes last, so any digits it contains are ignored).
for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)


import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width

inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):

    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, y)
    lines = re.split(pattern, y)[1:]  
    bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]

    out_lines = []
    for i, box in enumerate(bboxes):
        if len(box) != 4:
            continue
        x0, y0, x1, y1 = box

        if x0 >= x1 or y0 >= y1:
            continue

        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)

        label = lines[i] if i < len(lines) else ""
        label = label.lstrip(", ").strip()

        out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")

    return "\n".join(out_lines)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)  

draw = ImageDraw.Draw(image)

for line in output_text.strip().splitlines():
    coords = re.findall(r"-?\d+", line)[:8]
    if len(coords) < 8:
        continue
    xy = list(map(int, coords))
    draw.polygon(xy, outline="red")

image.save("output.png")
pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

# Fine-tuned weights; the processor comes from the base Microsoft checkpoint.
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
# Stream the test image; `.raw` is a file-like object PIL can open directly.
image = Image.open(requests.get(url, stream=True).raw)


import re
# Task prompt understood by the fine-tuned checkpoint (OCR with boxes).
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
# Popped so they are NOT forwarded to model.generate() via **inputs below.
# NOTE(review): presumably the processor's post-resize dims — confirm.
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
# Factors that map model-space bbox coordinates back to original pixels.
scale_height = raw_height / height
scale_width = raw_width / width

# Move every tensor to the GPU (some entries may be None, hence the guard).
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
# Cast to bfloat16 to match the model weights loaded above.
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

# NOTE(review): the <bbox>/<x_..>/<y_..> tags must survive
# skip_special_tokens=True — post_process() below parses them with a regex.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):
    """Turn OCR markup into ``x0,y0,x1,y0,x1,y1,x0,y1,label`` rows.

    Bounding boxes are read from ``<bbox>...</bbox>`` tags, rescaled from
    model coordinates to original-image pixels, and paired with the text
    segment that follows each tag in *y*.
    """
    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    tags = re.findall(pattern, y)
    # Text after each tag; element i belongs to tag i.
    labels = re.split(pattern, y)[1:]

    def fmt(i, coords):
        # Format one row, or return None when the box is unusable.
        if len(coords) != 4:
            return None
        x0, y0, x1, y1 = coords
        if x0 >= x1 or y0 >= y1:
            return None  # zero-area or inverted box
        sx0 = int(x0 * scale_width)
        sy0 = int(y0 * scale_height)
        sx1 = int(x1 * scale_width)
        sy1 = int(y1 * scale_height)
        text = labels[i].lstrip(", ").strip() if i < len(labels) else ""
        return f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{text}"

    rows = (fmt(i, [int(v) for v in re.findall(r"\d+", t)]) for i, t in enumerate(tags))
    return "\n".join(r for r in rows if r is not None)


# Rescale the decoded boxes to original-image pixels and print the result.
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)

draw = ImageDraw.Draw(image)

for entry in output_text.strip().splitlines():
    pts = re.findall(r"-?\d+", entry)[:8]
    if len(pts) == 8:
        draw.polygon([int(p) for p in pts], outline="red")

image.save("output.png")
Code example (Python — transformers)
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch 

# Fine-tuned weights; the processor comes from the base Microsoft checkpoint.
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
# Stream the test image; `.raw` is a file-like object PIL can open directly.
image = Image.open(requests.get(url, stream=True).raw)


import re
# Task prompt understood by the fine-tuned checkpoint (OCR with boxes).
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
# Popped so they are NOT forwarded to model.generate() via **inputs below.
# NOTE(review): presumably the processor's post-resize dims — confirm.
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
# Factors that map model-space bbox coordinates back to original pixels.
scale_height = raw_height / height
scale_width = raw_width / width

# Move every tensor to the GPU (some entries may be None, hence the guard).
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
# Cast to bfloat16 to match the model weights loaded above.
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

generated_ids = model.generate(
    **inputs,
    max_new_tokens=2000,
)

# NOTE(review): the <bbox>/<x_..>/<y_..> tags must survive
# skip_special_tokens=True — post_process() below parses them with a regex.
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

import re
from PIL import ImageDraw

def post_process(y, scale_height, scale_width):
    """Parse OCR markup and emit one comma-separated polygon row per box.

    Each ``<bbox>`` tag yields ``x0,y0,x1,y0,x1,y1,x0,y1,label`` where the
    corners are the axis-aligned box rescaled to original-image pixels and
    *label* is the text segment that follows the tag in *y*.
    """
    bbox_re = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    parsed = [
        [int(d) for d in re.findall(r"\d+", tag)]
        for tag in re.findall(bbox_re, y)
    ]
    # Text segment i (after dropping the pre-tag prefix) follows box i.
    trailing = re.split(bbox_re, y)[1:]

    result = []
    for pos, quad in enumerate(parsed):
        if len(quad) != 4:
            continue  # malformed tag — skip defensively
        left, top, right, bottom = quad
        if left >= right or top >= bottom:
            continue  # zero-area or inverted box
        a = int(left * scale_width)
        b = int(top * scale_height)
        c = int(right * scale_width)
        d = int(bottom * scale_height)
        caption = (trailing[pos] if pos < len(trailing) else "").lstrip(", ").strip()
        result.append(f"{a},{b},{c},{b},{c},{d},{a},{d},{caption}")

    return "\n".join(result)


output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)

# Draw each detected line as a red quadrilateral on the original image.
draw = ImageDraw.Draw(image)

for text_line in output_text.strip().splitlines():
    found = re.findall(r"-?\d+", text_line)
    if len(found) < 8:
        continue  # not a complete polygon row
    draw.polygon(list(map(int, found[:8])), outline="red")

image.save("output.png")

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.