Qwen3-Coder-Next-int2-mixed-AutoRound

307
1
license:apache-2.0
by
YCWTG
Language Model
OTHER
New
307 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
Unknown
Mobile
Laptop
Server
Quick Summary

Int2 mixed-precision AutoRound quantization of Qwen3-Coder-Next, a code-focused language model.

Code Examples

Model Size · Python · transformers
import math
import os

# Must run before `import torch`: the allocator config is read at CUDA init.
# NOTE(review): older torch releases read only PYTORCH_CUDA_ALLOC_CONF —
# confirm the installed version honors PYTORCH_ALLOC_CONF.
os.environ.setdefault(
    "PYTORCH_ALLOC_CONF",
    # Keep a safer allocator default and transparently migrate the deprecated env var.
    os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None) or "expandable_segments:True",
)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face repo id of the int2 mixed AutoRound quantized model.
MODEL_NAME = "YCWTG/Qwen3-Coder-Next-int2-mixed-AutoRound"
# When True, chat_loop scales max_new_tokens from the prompt length;
# otherwise MANUAL_MAX_NEW_TOKENS is used as a fixed cap.
AUTO_MAX_TOKENS = True
MANUAL_MAX_NEW_TOKENS = 128
AUTO_MAX_TOKENS_RATIO = 1.5

HAS_CUDA = torch.cuda.is_available()
# Read total VRAM once and use it to choose the default loading mode.
GPU_TOTAL_MIB = torch.cuda.get_device_properties(0).total_memory // (1024 ** 2) if HAS_CUDA else 0
# 32GB-class GPUs default to False; smaller GPUs default to True.
ENABLE_CPU_OFFLOAD = HAS_CUDA and GPU_TOTAL_MIB < 32000
# Per-device budgets passed to from_pretrained; only used when HAS_CUDA.
MAX_MEMORY = {0: "18GiB", "cpu": "64GiB"} if ENABLE_CPU_OFFLOAD else {0: "22GiB", "cpu": "16GiB"}


def get_input_device(model):
    """Pick the device that prompt tensors should be moved to.

    Under device_map="auto" the model is sharded, so ``model.device`` is not
    a reliable target; scan the accelerate placement map for the first CUDA
    location instead, falling back to CPU (if mapped) or the first parameter.
    """
    placement = getattr(model, "hf_device_map", None)
    saw_cpu = None
    if isinstance(placement, dict):
        for target in placement.values():
            if isinstance(target, int):
                # Integer entries are CUDA ordinals.
                return torch.device(f"cuda:{target}")
            if isinstance(target, str):
                if target.startswith("cuda"):
                    return torch.device(target)
                if target.startswith("cpu"):
                    saw_cpu = torch.device("cpu")
    return saw_cpu or next(model.parameters()).device


def load_model():
    """Download and initialize the quantized model and its tokenizer.

    Returns a ``(model, tokenizer)`` pair; the model is placed according to
    the module-level HAS_CUDA / ENABLE_CPU_OFFLOAD / MAX_MEMORY settings and
    switched to eval mode.
    """
    print("Loading model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True, use_fast=True)
    # Use EOS as PAD to avoid warnings for chat generation on models without a pad token.
    tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token
    tokenizer.padding_side = "left"

    load_kwargs = dict(
        pretrained_model_name_or_path=MODEL_NAME,
        dtype=torch.bfloat16,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        device_map="auto" if HAS_CUDA else "cpu",
    )
    if not HAS_CUDA:
        print("CUDA not available, running on CPU")
    else:
        print(f"GPU total memory: {GPU_TOTAL_MIB} MiB")
        load_kwargs["max_memory"] = MAX_MEMORY
        if ENABLE_CPU_OFFLOAD:
            load_kwargs["offload_buffers"] = True
            print("CPU offload: ON")
        else:
            print("CPU offload: OFF (GPU preferred, small CPU spill enabled)")

    try:
        model = AutoModelForCausalLM.from_pretrained(**load_kwargs)
    except RuntimeError as err:
        # Give a beginner-friendly hint instead of only showing the raw stack trace.
        if "out of memory" in str(err).lower():
            print("\nCUDA OOM while loading the model.")
            print("Close other GPU programs, or set ENABLE_CPU_OFFLOAD = True and run again.")
        raise
    model.eval()
    return model, tokenizer


def multiline_input():
    """Collect a multi-line user message from stdin.

    Returns None when the user types "exit"/"quit" on its own line,
    otherwise the newline-joined lines entered before the "END" sentinel
    (the sentinel itself is excluded).
    """
    print('User (type "END" on a single line to send, type "exit" to quit):')
    collected = []
    while True:
        raw = input()
        stripped = raw.strip()
        if stripped.lower() in {"exit", "quit"}:
            return None
        if stripped == "END":
            return "\n".join(collected)
        # Keep the raw line (only the sentinel checks use the stripped form).
        collected.append(raw)


def build_input_ids(tokenizer, messages, device):
    """Turn a chat history into a token-id tensor on the target device."""
    if getattr(tokenizer, "chat_template", None):
        # Preferred path for chat models: let the tokenizer build the prompt format.
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    else:
        # Generic fallback for tokenizers without a built-in chat template.
        parts = []
        for message in messages:
            speaker = "User" if message["role"] == "user" else "Assistant"
            parts.append(f"{speaker}: {message['content']}")
        parts.append("Assistant:")
        prompt = "\n".join(parts)
    encoded = tokenizer(prompt, return_tensors="pt")
    return encoded["input_ids"].to(device)


def chat_loop(model, tokenizer):
    """Interactive REPL: read multi-line prompts, generate replies, keep history."""
    print("\n===== Chat Started =====\n")
    print(f"Auto max_tokens: {'ON' if AUTO_MAX_TOKENS else 'OFF'}")
    if not AUTO_MAX_TOKENS:
        print(f"Manual max_new_tokens: {MANUAL_MAX_NEW_TOKENS}")
    print(
        "Tip: Set ENABLE_CPU_OFFLOAD = False for a faster full-GPU attempt."
        if ENABLE_CPU_OFFLOAD
        else "Tip: If max_tokens is too large and you hit CUDA OOM, set ENABLE_CPU_OFFLOAD = True."
    )

    # Full alternating user/assistant history, re-encoded as the prompt each turn.
    messages = []
    device = get_input_device(model)
    print(f"Input device: {device}")

    while True:
        user_text = multiline_input()
        if user_text is None:
            # User typed "exit"/"quit".
            break

        messages.append({"role": "user", "content": user_text})
        input_ids = build_input_ids(tokenizer, messages, device)
        prompt_tokens = int(input_ids.shape[-1])
        # Auto mode scales output length with prompt length (1.5x by default).
        max_new_tokens = max(1, math.ceil(prompt_tokens * AUTO_MAX_TOKENS_RATIO)) if AUTO_MAX_TOKENS else int(MANUAL_MAX_NEW_TOKENS)

        print(f"Prompt tokens: {prompt_tokens}")
        print(f"max_new_tokens: {max_new_tokens}")

        try:
            # inference_mode avoids autograd bookkeeping during generation.
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids=input_ids,
                    max_new_tokens=max_new_tokens,
                    do_sample=True,
                    temperature=1.0,
                    top_p=0.95,
                    top_k=40,
                    # NOTE(review): use_cache=False disables the KV cache,
                    # trading speed for memory — confirm this is intentional.
                    use_cache=False,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=tokenizer.eos_token_id,
                )
        except RuntimeError as e:
            error_text = str(e).lower()
            if HAS_CUDA and ("cublas_status_alloc_failed" in error_text or "out of memory" in error_text):
                # Clear cached blocks so the next try starts from a cleaner CUDA state.
                torch.cuda.empty_cache()
                print("\nCUDA OOM during generation.")
                print("Set ENABLE_CPU_OFFLOAD = True, or disable AUTO_MAX_TOKENS and lower MANUAL_MAX_NEW_TOKENS.")
                # Drop the failed user turn so history stays consistent next round.
                messages.pop()
                continue
            raise

        # Decode only the newly generated tokens (slice off the prompt prefix).
        reply_text = tokenizer.decode(output_ids[0, input_ids.shape[-1]:], skip_special_tokens=True)
        print(f"\nAssistant:\n{reply_text}\n")
        messages.append({"role": "assistant", "content": reply_text})


if __name__ == "__main__":
    # Script entry point: load once, then chat until the user exits.
    chat_model, chat_tokenizer = load_model()
    chat_loop(chat_model, chat_tokenizer)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.