LFM2-VL-450M-ONNX

72
5
by
onnx-community
Image Model
OTHER
New
72 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary

ONNX export of the LFM2-VL-450M vision-language model: it takes an image plus a text prompt and generates text.

Code Examples

🏃 How to run LFM2-VL (Python, transformers)
from transformers import AutoConfig, AutoProcessor
from transformers.image_utils import load_image
import onnxruntime
import numpy as np
from huggingface_hub import hf_hub_download

# 1. Load config, processor, and model
model_id = "onnx-community/LFM2-VL-450M-ONNX"
config = AutoConfig.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

local_dir = 'LFM2-VL-450M-ONNX'

## Each component ships as a graph file (*.onnx) plus a weights file
## (*.onnx_data); both are fetched into `local_dir`, in this order.
onnx_files = (
  "vision_encoder.onnx",
  "vision_encoder.onnx_data",
  "embed_tokens.onnx",
  "embed_tokens.onnx_data",
  "decoder_model_merged.onnx",
  "decoder_model_merged.onnx_data",
)
downloaded = {
  file_name: hf_hub_download(model_id, file_name, subfolder="onnx", local_dir=local_dir)
  for file_name in onnx_files
}

## Only the graph paths are needed later to open the inference sessions
vision_model_path = downloaded["vision_encoder.onnx"]
embed_model_path = downloaded["embed_tokens.onnx"]
decoder_model_path = downloaded["decoder_model_merged.onnx"]

## Load sessions
## One ONNX Runtime session per graph, created in vision -> embed -> decoder order
providers = ['CPUExecutionProvider']
vision_session, embed_session, decoder_session = (
  onnxruntime.InferenceSession(graph_path, providers=providers)
  for graph_path in (vision_model_path, embed_model_path, decoder_model_path)
)

## Set config values
text_config = config.text_config
image_token_index = config.image_token_index  # token id whose embeddings get replaced by vision features

## Sizes used below to build the empty caches
hidden_size = text_config.hidden_size
num_hidden_layers = text_config.num_hidden_layers
num_key_value_heads = text_config.num_key_value_heads
head_dim = hidden_size // text_config.num_attention_heads
conv_L_cache = text_config.conv_L_cache
layer_types = text_config.layer_types

## Token id(s) that stop generation
eos_token_id = text_config.eos_token_id

# 2. Prepare inputs
image_url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = load_image(image_url)

## A single user turn containing the image followed by the question
user_turn = {
  "role": "user",
  "content": [
    {"type": "image", "image": image},
    {"type": "text", "text": "What is in this image?"},
  ],
}
messages = [user_turn]

inputs = processor.apply_chat_template(
  messages,
  add_generation_prompt=True,
  tokenize=True,
  return_dict=True,
  return_tensors="pt",
)

## The ONNX sessions are fed NumPy arrays, so convert the processor's tensors
input_ids = inputs['input_ids'].numpy()
attention_mask = inputs['attention_mask'].numpy()

## Vision tensors exist only when the processor produced pixel values
has_vision_inputs = 'pixel_values' in inputs
if has_vision_inputs:
  pixel_values = inputs['pixel_values'].numpy()
  pixel_attention_mask = inputs['pixel_attention_mask'].numpy().astype(np.int64)
  spatial_shapes = inputs['spatial_shapes'].numpy()
else:
  pixel_values = pixel_attention_mask = spatial_shapes = None

## Build the initial (empty) cache, one entry set per decoder layer.
## Insertion order matters: the generation loop pairs these keys with the
## decoder outputs by position.
batch_size = input_ids.shape[0]
kv_cache_shape = [batch_size, num_key_value_heads, 0, head_dim]  # zero-length sequence axis: nothing cached yet
conv_cache_shape = [batch_size, hidden_size, conv_L_cache]

past_cache_values = {}
for layer_idx in range(num_hidden_layers):
  layer_type = layer_types[layer_idx]
  if layer_type == 'conv':
    past_cache_values[f'past_conv.{layer_idx}'] = np.zeros(conv_cache_shape, dtype=np.float32)
  elif layer_type == 'full_attention':
    past_cache_values[f'past_key_values.{layer_idx}.key'] = np.zeros(kv_cache_shape, dtype=np.float32)
    past_cache_values[f'past_key_values.{layer_idx}.value'] = np.zeros(kv_cache_shape, dtype=np.float32)
  else:
    raise ValueError(f"Unsupported layer type: {layer_type}")

# 3. Generation loop
## Greedy decoding: every step embeds the current token ids, runs the decoder
## with the cached state, and feeds the arg-max token back in as the next input.
max_new_tokens = 1024
## Start with one empty row per sequence so the per-step concatenation below
## works for any batch size (a literal [[]] would hard-code a batch of 1).
generated_tokens = np.zeros((batch_size, 0), dtype=np.int64)
image_features = None
for i in range(max_new_tokens):
  inputs_embeds = embed_session.run(None, {'input_ids': input_ids})[0]

  if has_vision_inputs and image_features is None:
    ## Only compute vision features if not already computed
    ## (i.e. once, on the first step, while input_ids is still the full prompt)
    image_features = vision_session.run(None, dict(
      pixel_values=pixel_values,
      pixel_attention_mask=pixel_attention_mask,
      spatial_shapes=spatial_shapes,
    ))[0]

    ## Merge text and vision embeddings: every position holding the image
    ## placeholder token is overwritten with one row of the flattened features
    inputs_embeds[input_ids == image_token_index] = image_features.reshape(-1, image_features.shape[-1])

  logits, *present_cache_values = decoder_session.run(None, dict(
    inputs_embeds=inputs_embeds,
    attention_mask=attention_mask,
    **past_cache_values,
  ))

  ## Update values for next generation loop
  ## Next input is the highest-scoring token at the last position
  input_ids = logits[:, -1].argmax(-1, keepdims=True)
  attention_mask = np.concatenate([attention_mask, np.ones((batch_size, 1), dtype=attention_mask.dtype)], axis=-1)
  ## Outputs after the logits are matched to the cache inputs by position,
  ## relying on the insertion order of past_cache_values
  for j, key in enumerate(past_cache_values):
    past_cache_values[key] = present_cache_values[j]

  generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1)
  ## Stop as soon as any sequence in the batch emits an EOS token
  if np.isin(input_ids, eos_token_id).any():
    break

  ## (Optional) Streaming — prints the first sequence's new token only
  print(processor.decode(input_ids[0], skip_special_tokens=False), end='', flush=True)
print()

# 4. Output result
## Decode every generated sequence, then show the first one (special tokens kept)
decoded_batch = processor.batch_decode(generated_tokens, skip_special_tokens=False)
print(decoded_batch[0])

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.