Qwen3-VL-4B-catmus

Name: Qwen3-VL-4B-catmus
Author: small-models-for-glam

4.0B

3 languages

—

small-models-for-glam

Other

OTHER

4B params

New

0 downloads

Early-stage

Try on Hugging Face Add to Compare

Edge AI:

Mobile

Laptop

Server

9GB+ RAM

Mobile

Laptop

Server

Quick Summary

This model is a fine-tuned version of Qwen/Qwen3-VL-4B-Instruct for line-level transcription of medieval manuscript images.

Device Compatibility

Mobile

4-6GB RAM

Laptop

16GB RAM

Server

GPU

Minimum Recommended

4GB+ RAM

Code Examples

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The PEFT adapter is applied on top of the base checkpoint; the processor is
# loaded from the base model.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is deprecated in recent transformers releases
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# The context manager closes the file handle once the pixels are read, and
# convert("RGB") normalizes grayscale, palette, or RGBA scans to three channels.
with Image.open("path/to/your/manuscript_image.jpg") as img:
    image = img.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated text is decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor.
# The fine-tuned weights are applied with PEFT on top of the base checkpoint,
# so the base model is loaded first; the processor also comes from the base.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    torch_dtype="auto",
    device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image.
# Image.open() is lazy and keeps the file handle open; the context manager
# closes it, and convert("RGB") yields an in-memory RGB image that remains
# usable after the file has been closed.
with Image.open("path/to/your/manuscript_image.jpg") as image_file:
    image = image_file.convert("RGB")

# Prepare the message: one user turn with the image followed by the prompt.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription.
# The chat template tokenizes the conversation and returns tensors that are
# moved to the device the model lives on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the leading prompt tokens from each sequence so that only the newly
# generated ids are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, attach the fine-tuned adapter on top of
# it, and load the base model's processor.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read it inside a context manager; convert() returns an in-memory RGB copy
# that stays valid after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn holding the image and the instruction
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription: render the chat template, tokenize it, and move the
# resulting tensors to the device the model was placed on.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with the prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (Python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load the base vision-language model, then attach the fine-tuned adapter.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",  # `torch_dtype` is the deprecated spelling of this argument
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base checkpoint, not the adapter repo.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image. Image.open() is lazy and keeps the file handle open, so
# read the pixels inside a context manager; convert() returns a loaded copy
# that stays usable after the file is closed.
with Image.open("path/to/your/manuscript_image.jpg") as source_image:
    image = source_image.convert("RGB")

# Prepare the message: a single user turn with the image and the instruction.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Build the model inputs from the chat template and move them to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

# Generate transcription
generated_ids = model.generate(**inputs, max_new_tokens=256)
# Each output row starts with its prompt tokens; slice them off so only the
# newly generated tokens are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Quick start (python, transformers)

from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
from peft import PeftModel
from PIL import Image

# Load model and processor
# The base checkpoint is loaded first; the fine-tuned PEFT adapter is applied on top of it.
base_model = "Qwen/Qwen3-VL-4B-Instruct"
adapter_model = "small-models-for-glam/Qwen3-VL-4B-catmus"

# `dtype` is the current keyword for selecting the weight precision;
# the older `torch_dtype` spelling is deprecated in recent transformers releases.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    base_model,
    dtype="auto",
    device_map="auto",
)
model = PeftModel.from_pretrained(model, adapter_model)
# The processor is loaded from the base model id, not from the adapter id.
processor = AutoProcessor.from_pretrained(base_model)

# Load your image
image = Image.open("path/to/your/manuscript_image.jpg")

# Prepare the message: one user turn holding the image followed by the instruction text
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Transcribe the text shown in this image."},
        ],
    },
]

# Generate transcription
# Tokenize the chat-formatted prompt and move the tensors to the model's device.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

generated_ids = model.generate(**inputs, max_new_tokens=256)
# Slice off the first len(in_ids) tokens of every output sequence so that only
# the tokens produced after the prompt are decoded.
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
# Only one conversation was submitted, so the first decoded string is the result.
transcription = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]

print(transcription)

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (bibtex)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Citations (BibTeX)

@article{Qwen3-VL,
  title={Qwen3-VL: Large Vision Language Models Pretrained on Massive Data},
  author={Qwen Team},
  journal={arXiv preprint},
  year={2024}
}

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.