LaSER-Qwen3-8B
by Alibaba-NLP · license: MIT
Embedding Model · 8B params
New · 0 downloads · Early-stage
Edge AI: Mobile / Laptop / Server · 18GB+ RAM
Quick Summary
LaSER-Qwen3-8B is an 8B-parameter text embedding model from Alibaba-NLP built on Qwen3-8B. Rather than pooling a single forward pass, it runs a few latent "thinking" steps, feeding soft token embeddings back into the model and averaging the per-step hidden states into one L2-normalized vector (see the usage example below).
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 8GB+ RAM
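For reference, an 8B-parameter model stores roughly 16GB of weights in float16, so the smaller RAM tiers above realistically require a quantized build. A minimal sketch of 4-bit loading through transformers + bitsandbytes; the specific quantization settings here are illustrative, not an official recommendation for this model:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "Alibaba-NLP/LaSER-Qwen3-8B"

# NF4 4-bit weights bring an 8B model down to roughly 5-6GB.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
).eval()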
Code Examples
Usage (Python, transformers)
import torch
import torch.nn.functional as F
from transformers import AutoModelForCausalLM, AutoTokenizer

def laser_encode(model, tokenizer, texts, max_length=512, num_thinking_steps=3):
    """Encode texts using LaSER's latent thinking mechanism."""
    device = next(model.parameters()).device
    batch = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
    input_ids, attention_mask = batch["input_ids"], batch["attention_mask"]
    batch_size = input_ids.size(0)
    # Reserve one trailing slot per extra thinking step; each slot starts as EOS
    # and is overwritten below with a soft "latent thought" embedding.
    thinking_slots = num_thinking_steps - 1
    eos_id = tokenizer.eos_token_id
    if thinking_slots > 0:
        eos_padding = torch.full((batch_size, thinking_slots), eos_id, dtype=input_ids.dtype, device=device)
        mask_padding = torch.ones((batch_size, thinking_slots), dtype=attention_mask.dtype, device=device)
        input_ids = torch.cat([input_ids, eos_padding], dim=1)
        attention_mask = torch.cat([attention_mask, mask_padding], dim=1)
    input_embeds = model.get_input_embeddings()(input_ids)
    embedding_table = model.get_input_embeddings().weight
    base_seq_len = input_embeds.size(1) - thinking_slots
    past_key_values = None
    hidden_steps = []
    for step_idx in range(thinking_slots):
        pos = base_seq_len + step_idx
        # First pass runs the full prompt; later passes feed only the newest
        # slot embedding and reuse the KV cache for everything before it.
        step_embeds = input_embeds[:, :pos, :] if past_key_values is None else input_embeds[:, pos - 1:pos, :]
        step_mask = attention_mask[:, :pos]
        outputs = model(inputs_embeds=step_embeds, attention_mask=step_mask,
                        output_hidden_states=True, past_key_values=past_key_values,
                        use_cache=True, return_dict=True)
        hidden_steps.append(outputs.hidden_states[-1][:, -1, :])
        # Soft next token: a probability-weighted mix over the embedding table,
        # written into the next thinking slot instead of a sampled hard token.
        token_probs = torch.softmax(outputs.logits[:, -1, :], dim=-1)
        new_embed = token_probs @ embedding_table
        past_key_values = outputs.past_key_values
        pre = input_embeds[:, :pos, :]
        post = input_embeds[:, pos + 1:, :]
        input_embeds = torch.cat([pre, new_embed.unsqueeze(1), post], dim=1)
    # Final pass over the last slot (or the whole prompt if no slots were used).
    final_embeds = input_embeds[:, -1:, :] if past_key_values is not None else input_embeds
    outputs = model(inputs_embeds=final_embeds, attention_mask=attention_mask,
                    output_hidden_states=True, past_key_values=past_key_values,
                    use_cache=True, return_dict=True)
    hidden_steps.append(outputs.hidden_states[-1][:, -1, :])
    # Mean-pool the last-token hidden state from every step, then L2-normalize.
    embeddings = torch.stack(hidden_steps, dim=1).mean(dim=1)
    return F.normalize(embeddings, p=2, dim=-1)
# Load model
model_name = "Alibaba-NLP/LaSER-Qwen3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"  # left padding keeps the last position a real token
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, trust_remote_code=True
).cuda().eval()

# Encode queries and documents
with torch.inference_mode():
    query_emb = laser_encode(model, tokenizer, ["why is the sky blue"], num_thinking_steps=3)
    doc_emb = laser_encode(model, tokenizer, ["Rayleigh scattering makes short wavelengths scatter more strongly"], num_thinking_steps=3)

# Compute similarity: embeddings are L2-normalized, so the dot product is cosine similarity
similarity = (query_emb @ doc_emb.T).item()
print(f"Cosine similarity: {similarity:.4f}")
Batch encoding (Python)
queries = [
    "What causes tides in the ocean?",
    "How does photosynthesis convert light to energy?",
    "Why do metals conduct electricity?",
]
with torch.inference_mode():
    query_embeddings = laser_encode(model, tokenizer, queries, num_thinking_steps=3)
print(f"Batch embeddings shape: {query_embeddings.shape}")  # (3, 4096)
Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model. Production-ready inference API; start free, scale to millions.
Replicate
One-click model deployment. Run models in the cloud with a simple API; no DevOps required.
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.
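If the model ends up served behind an OpenAI-compatible embeddings endpoint such as Together.ai's, a call would look roughly like the sketch below; the model ID and its availability on either platform are assumptions, not something this page confirms:

import os
import requests

# Hypothetical: assumes Together.ai hosts this model under its Hugging Face ID.
resp = requests.post(
    "https://api.together.xyz/v1/embeddings",
    headers={"Authorization": f"Bearer {os.environ['TOGETHER_API_KEY']}"},
    json={"model": "Alibaba-NLP/LaSER-Qwen3-8B", "input": "why is the sky blue"},
    timeout=30,
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))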