Qwen3-Coder-Next-int2-mixed-AutoRound
307
1
license:apache-2.0
by
YCWTG
Language Model
OTHER
New
307 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary
AI model with specialized capabilities.
Code Examples
Model Size — Python / transformers
# --- Module setup: imports, allocator env var, and hardware-derived defaults ---
import math
import os
# Must run BEFORE `import torch`: the allocator config is read when CUDA initializes.
os.environ.setdefault(
"PYTORCH_ALLOC_CONF",
# Keep a safer allocator default and transparently migrate the deprecated env var.
# NOTE(review): pop() runs even when PYTORCH_ALLOC_CONF is already set (arguments
# are evaluated before setdefault), so the deprecated variable is always removed.
os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None) or "expandable_segments:True",
)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Hugging Face repo id of the 2-bit (AutoRound) quantized model to load.
MODEL_NAME = "YCWTG/Qwen3-Coder-Next-int2-mixed-AutoRound"
# True: max_new_tokens is derived from prompt length (see chat_loop);
# False: MANUAL_MAX_NEW_TOKENS is used verbatim.
AUTO_MAX_TOKENS = True
MANUAL_MAX_NEW_TOKENS = 128
# Output budget as a multiple of the prompt token count (auto mode only).
AUTO_MAX_TOKENS_RATIO = 1.5
HAS_CUDA = torch.cuda.is_available()
# Read total VRAM once and use it to choose the default loading mode.
GPU_TOTAL_MIB = torch.cuda.get_device_properties(0).total_memory // (1024 ** 2) if HAS_CUDA else 0
# 32GB-class GPUs default to False; smaller GPUs default to True.
ENABLE_CPU_OFFLOAD = HAS_CUDA and GPU_TOTAL_MIB < 32000
# Per-device memory caps handed to accelerate's device_map planner via max_memory.
MAX_MEMORY = {0: "18GiB", "cpu": "64GiB"} if ENABLE_CPU_OFFLOAD else {0: "22GiB", "cpu": "16GiB"}
def get_input_device(model):
    """Pick the device that input tensors should be moved to.

    With device_map="auto" the first usable device may differ from
    ``model.device``: prefer any CUDA placement found in ``hf_device_map``,
    fall back to CPU when the map only mentions CPU, and finally fall back
    to the device of the model's first parameter.
    """
    placement = getattr(model, "hf_device_map", None)
    saw_cpu = False
    if isinstance(placement, dict):
        for target in placement.values():
            # Integer entries are CUDA device ordinals.
            if isinstance(target, int):
                return torch.device(f"cuda:{target}")
            if isinstance(target, str) and target.startswith("cuda"):
                return torch.device(target)
            if isinstance(target, str) and target.startswith("cpu"):
                saw_cpu = True
    if saw_cpu:
        return torch.device("cpu")
    return next(model.parameters()).device
def load_model():
    """Load the tokenizer and quantized model for the detected hardware.

    Returns:
        (model, tokenizer) with the model switched to eval mode.

    Raises:
        RuntimeError: re-raised from ``from_pretrained`` after printing a
        beginner-friendly hint when the message indicates CUDA OOM.
    """
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, use_fast=True)
    # Use EOS as PAD to avoid warnings for chat generation on models without a pad token.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "left"
    load_kwargs = dict(
        pretrained_model_name_or_path=MODEL_NAME,
        dtype=torch.bfloat16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="auto" if HAS_CUDA else "cpu",
    )
    if not HAS_CUDA:
        print("CUDA not available, running on CPU")
    else:
        print(f"GPU total memory: {GPU_TOTAL_MIB} MiB")
        load_kwargs["max_memory"] = MAX_MEMORY
        if ENABLE_CPU_OFFLOAD:
            load_kwargs["offload_buffers"] = True
            print("CPU offload: ON")
        else:
            print("CPU offload: OFF (GPU preferred, small CPU spill enabled)")
    try:
        model = AutoModelForCausalLM.from_pretrained(**load_kwargs)
    except RuntimeError as err:
        # Give a beginner-friendly hint instead of only showing the raw stack trace.
        if "out of memory" in str(err).lower():
            print("\nCUDA OOM while loading the model.")
            print("Close other GPU programs, or set ENABLE_CPU_OFFLOAD = True and run again.")
        raise
    model.eval()
    return model, tokenizer
def multiline_input():
    """Collect a multi-line message from stdin.

    A line whose stripped content is exactly "END" sends the message;
    "exit" or "quit" (case-insensitive) aborts the chat.

    Returns:
        The joined message text, or ``None`` when the user wants to quit.
    """
    print('User (type "END" on a single line to send, type "exit" to quit):')
    collected = []
    while True:
        raw = input()
        stripped = raw.strip()
        if stripped.lower() in ("exit", "quit"):
            return None
        if stripped == "END":
            return "\n".join(collected)
        # Keep the original line (with its whitespace), not the stripped copy.
        collected.append(raw)
def build_input_ids(tokenizer, messages, device):
    """Render the chat history into a prompt and tokenize it onto *device*.

    Uses the tokenizer's chat template when one exists; otherwise falls back
    to a simple "User:/Assistant:" transcript ending with "Assistant:".
    """
    if getattr(tokenizer, "chat_template", None):
        # Preferred path for chat models: let the tokenizer build the prompt format.
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    else:
        # Generic fallback for tokenizers without a built-in chat template.
        transcript = []
        for message in messages:
            speaker = "User" if message["role"] == "user" else "Assistant"
            transcript.append(f"{speaker}: {message['content']}")
        transcript.append("Assistant:")
        prompt = "\n".join(transcript)
    encoded = tokenizer(prompt, return_tensors="pt")
    return encoded["input_ids"].to(device)
def chat_loop(model, tokenizer):
    """Run the interactive REPL: read a message, generate a reply, repeat.

    Maintains the full conversation in ``messages`` (role/content dicts) so
    each generation sees the whole history. Exits when multiline_input()
    returns None ("exit"/"quit").
    """
    print("\n===== Chat Started =====\n")
    print(f"Auto max_tokens: {'ON' if AUTO_MAX_TOKENS else 'OFF'}")
    if not AUTO_MAX_TOKENS:
        print(f"Manual max_new_tokens: {MANUAL_MAX_NEW_TOKENS}")
    print(
        "Tip: Set ENABLE_CPU_OFFLOAD = False for a faster full-GPU attempt."
        if ENABLE_CPU_OFFLOAD
        else "Tip: If max_tokens is too large and you hit CUDA OOM, set ENABLE_CPU_OFFLOAD = True."
    )
    messages = []
    # Inputs must land on the first device of the (possibly sharded) model.
    device = get_input_device(model)
    print(f"Input device: {device}")
    while True:
        user_text = multiline_input()
        if user_text is None:
            break
        messages.append({"role": "user", "content": user_text})
        input_ids = build_input_ids(tokenizer, messages, device)
        prompt_tokens = int(input_ids.shape[-1])
        # Auto mode scales output length with prompt length (1.5x by default).
        max_new_tokens = max(1, math.ceil(prompt_tokens * AUTO_MAX_TOKENS_RATIO)) if AUTO_MAX_TOKENS else int(MANUAL_MAX_NEW_TOKENS)
        print(f"Prompt tokens: {prompt_tokens}")
        print(f"max_new_tokens: {max_new_tokens}")
        try:
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids=input_ids,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=1.0,
                    top_p=0.95,
                    top_k=40,
                    # NOTE(review): use_cache=False recomputes attention each step —
                    # very slow; presumably chosen to cap KV-cache memory on this
                    # int2/offloaded setup. Confirm before "fixing".
                    use_cache=False,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
        except RuntimeError as e:
            error_text = str(e).lower()
            if HAS_CUDA and ("cublas_status_alloc_failed" in error_text or "out of memory" in error_text):
                # Clear cached blocks so the next try starts from a cleaner CUDA state.
                torch.cuda.empty_cache()
                print("\nCUDA OOM during generation.")
                print("Set ENABLE_CPU_OFFLOAD = True, or disable AUTO_MAX_TOKENS and lower MANUAL_MAX_NEW_TOKENS.")
                # Drop the just-added user turn so the history stays consistent
                # with what the model actually answered.
                messages.pop()
                continue
            raise
        # Decode only the newly generated tokens (skip the echoed prompt).
        reply_text = tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)
        print(f"\nAssistant:\n{reply_text}\n")
        messages.append({"role": "assistant", "content": reply_text})
if __name__ == "__main__":
    # Script entry point: load the model/tokenizer once, then chat interactively.
    model, tokenizer = load_model()
    chat_loop(model, tokenizer)

Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API

Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.