kosmos-2.5-ft
21
2
1 language
license:apache-2.0
by
merve
Image Model
OTHER
2.5B params
New
21 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
6GB+ RAM
Mobile
Laptop
Server
Quick Summary
Kosmos-2.5 fine-tuned for grounded OCR (OCR with bounding boxes); the fine-tuning script is available here: (GH, HF)
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
3GB+ RAM
Code Examples
Python (Transformers)
"""Grounded OCR with merve/kosmos-2.5-ft: run OCR on an image, map the
predicted bounding boxes back to the original image resolution, print them,
and draw them onto the image saved as output.png."""
import re


def post_process(text, scale_height, scale_width):
    """Parse ``<bbox><x_..><y_..><x_..><y_..></bbox>`` tags out of *text*.

    Returns one line per valid box: the four corner points as 8 comma-separated
    ints (clockwise from top-left), followed by the recognized text for that
    box. *scale_height* / *scale_width* map model-space coordinates back to
    the raw image. Malformed tags and degenerate boxes (x0 >= x1 or y0 >= y1)
    are skipped.
    """
    pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
    bboxes_raw = re.findall(pattern, text)
    # The text segment FOLLOWING each bbox tag is that box's label;
    # [1:] drops whatever precedes the first tag (e.g. the prompt echo).
    labels = re.split(pattern, text)[1:]
    out_lines = []
    for i, raw in enumerate(bboxes_raw):
        box = list(map(int, re.findall(r"\d+", raw)))
        if len(box) != 4:
            continue  # malformed tag
        x0, y0, x1, y1 = box
        if x0 >= x1 or y0 >= y1:
            continue  # degenerate box
        sx0, sx1 = int(x0 * scale_width), int(x1 * scale_width)
        sy0, sy1 = int(y0 * scale_height), int(y1 * scale_height)
        label = labels[i].lstrip(", ").strip() if i < len(labels) else ""
        out_lines.append(
            f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}"
        )
    return "\n".join(out_lines)


def main():
    # Heavy third-party imports stay inside main() so that importing this
    # module (e.g. to reuse post_process) does not download a 2.5B model.
    import requests
    import torch
    from PIL import Image, ImageDraw
    from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration

    model = Kosmos2_5ForConditionalGeneration.from_pretrained(
        "merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16
    )
    processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")

    url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
    # timeout avoids hanging forever on a stalled connection.
    image = Image.open(requests.get(url, stream=True, timeout=30).raw)

    inputs = processor(text="<ocr>", images=image, return_tensors="pt")
    # The processor reports the resized height/width it fed the model;
    # use them to rescale predicted coordinates back to the raw image.
    height, width = inputs.pop("height"), inputs.pop("width")
    raw_width, raw_height = image.size
    scale_height = raw_height / height
    scale_width = raw_width / width

    inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
    # Patches must match the model dtype (bfloat16).
    inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)

    generated_ids = model.generate(**inputs, max_new_tokens=2000)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)

    output_text = post_process(generated_text[0], scale_height, scale_width)
    print(output_text)

    # Draw each 8-point quadrilateral on the original image.
    draw = ImageDraw.Draw(image)
    for line in output_text.strip().splitlines():
        coords = re.findall(r"-?\d+", line)[:8]
        if len(coords) < 8:
            continue
        draw.polygon(list(map(int, coords)), outline="red")
    image.save("output.png")


if __name__ == "__main__":
    main()
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")pythontransformers
from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
import torch
model = Kosmos2_5ForConditionalGeneration.from_pretrained("merve/kosmos-2.5-ft", device_map="cuda", dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained("microsoft/kosmos-2.5")
import requests
from PIL import Image
url = "https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/ufo-sighting.jpg"
image = Image.open(requests.get(url, stream=True).raw)
import re
prompt = "<ocr>"
inputs = processor(text=prompt, images=image, return_tensors="pt")
height, width = inputs.pop("height"), inputs.pop("width")
raw_width, raw_height = image.size
scale_height = raw_height / height
scale_width = raw_width / width
inputs = {k: v.to("cuda") if v is not None else None for k, v in inputs.items()}
inputs["flattened_patches"] = inputs["flattened_patches"].to(torch.bfloat16)
generated_ids = model.generate(
**inputs,
max_new_tokens=2000,
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
import re
from PIL import ImageDraw
def post_process(y, scale_height, scale_width):
pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
bboxes_raw = re.findall(pattern, y)
lines = re.split(pattern, y)[1:]
bboxes = [list(map(int, re.findall(r"\d+", bb))) for bb in bboxes_raw]
out_lines = []
for i, box in enumerate(bboxes):
if len(box) != 4:
continue
x0, y0, x1, y1 = box
if x0 >= x1 or y0 >= y1:
continue
sx0 = int(x0 * scale_width)
sy0 = int(y0 * scale_height)
sx1 = int(x1 * scale_width)
sy1 = int(y1 * scale_height)
label = lines[i] if i < len(lines) else ""
label = label.lstrip(", ").strip()
out_lines.append(f"{sx0},{sy0},{sx1},{sy0},{sx1},{sy1},{sx0},{sy1},{label}")
return "\n".join(out_lines)
output_text = post_process(generated_text[0], scale_height, scale_width)
print(output_text)
draw = ImageDraw.Draw(image)
for line in output_text.strip().splitlines():
coords = re.findall(r"-?\d+", line)[:8]
if len(coords) < 8:
continue
xy = list(map(int, coords))
draw.polygon(xy, outline="red")
image.save("output.png")Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API

Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.