Qwen3-Coder-30B-A3B-Instruct-int4-mixed-AutoRound

by YCWTG
License: Apache-2.0
Language Model · 30B params · 225 downloads
Status: New · Early-stage
Edge AI: Mobile / Laptop / Server (68GB+ RAM)

Quick Summary

An int4 mixed-precision AutoRound quantization of Qwen3-Coder-30B-A3B-Instruct, Qwen's 30B-parameter mixture-of-experts coding model with roughly 3B parameters active per token. Quantizing the weights to int4 cuts their size to roughly a quarter of bf16, which is what makes local serving on a single workstation or GPU practical.
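
For context, AutoRound is Intel's weight-quantization library, and a checkpoint like this one is typically produced with a short recipe along the following lines. This is a minimal sketch with assumed settings (the bits/group_size/sym values are common defaults, not the exact configuration used for this upload):

# Minimal AutoRound int4 quantization sketch. Settings are assumptions,
# not this checkpoint's actual recipe.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

base = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(base)

# int4 weight-only quantization with a 128-element group size.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=True)
autoround.quantize()
autoround.save_quantized("./Qwen3-Coder-30B-A3B-int4-autoround")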

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 28GB+ RAM
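
As a rough sanity check on that minimum (simple arithmetic, assuming pure int4 weight storage):

# Back-of-the-envelope footprint: int4 weights take ~0.5 bytes per parameter.
params = 30e9
weights_gb = params * 0.5 / 1e9
print(f"~{weights_gb:.0f} GB of weights")  # ~15 GB; KV cache and runtime overhead push RAM needs higher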

Code Examples

Quickstart (Python, vLLM)
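
The script below starts a local vLLM server for this checkpoint, waits for its OpenAI-compatible API to come up, and then opens an interactive chat client, in a new terminal window where the platform supports it. Save it as, say, chat_vllm.py (the filename is arbitrary) and run it with python chat_vllm.py.
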
import argparse
import atexit
import json
import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request


def multiline_input():
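    # Collect input lines until a lone "END"; returns None if the user types "exit"/"quit".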
    print('User (type "END" on a single line to send, "exit" to quit):')
    lines = []
    while True:
        line = input()
        text = line.strip()
        if text.lower() in {"exit", "quit"}:
            return None
        if text == "END":
            break
        lines.append(line)
    return "\n".join(lines)


def resolve_client_host(host):
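    # Wildcard bind addresses are not connectable, so point the client at loopback instead.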
    return "127.0.0.1" if host in {"0.0.0.0", "::"} else host


def launch_vllm(args, api_key):
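    # Assemble the "vllm serve" command line from the parsed arguments and start it as a subprocess.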
    cmd = [
        "vllm",
        "serve",
        args.model,
        "--served-model-name",
        args.served_model_name,
        "--host",
        args.host,
        "--port",
        str(args.port),
        "--max-model-len",
        str(args.max_model_len),
        "--tool-call-parser",
        args.tool_call_parser,
        "--attention-backend",
        args.attention_backend,
        "--api-key",
        api_key,
    ]
    if args.enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")

    print("Launching vLLM:")
    print(" ".join(cmd))
    try:
        return subprocess.Popen(cmd)
    except FileNotFoundError as e:
        raise RuntimeError("vllm command not found. Activate an environment that has vllm installed.") from e


def stop_vllm(proc):
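    # Terminate the server gracefully; escalate to a hard kill if it does not exit within 10s.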
    if proc and proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()


def wait_vllm_ready(base_url, api_key, timeout_sec=180):
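    # Poll the OpenAI-compatible /v1/models endpoint until the server answers 200 or the deadline passes.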
    deadline = time.time() + timeout_sec
    url = f"{base_url}/v1/models"
    req = urllib.request.Request(url=url, headers={"Authorization": f"Bearer {api_key}"})
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(req, timeout=3) as resp:
                if resp.status == 200:
                    return True
        except urllib.error.URLError:
            pass
        time.sleep(1)
    return False


def chat_once(base_url, model_name, messages, api_key):
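    # Send a single non-streaming chat-completion request and return the assistant message dict.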
    payload = {"model": model_name, "messages": messages}
    req = urllib.request.Request(
        url=f"{base_url}/v1/chat/completions",
        data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=600) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    return data["choices"][0]["message"]


def chat_loop(base_url, model_name, api_key):
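    # Interactive loop: the full message history is resent each turn so the model keeps context.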
    print("\n===== Chat Started =====\n")
    messages = []

    while True:
        user_text = multiline_input()
        if user_text is None:
            break

        messages.append({"role": "user", "content": user_text})
        try:
            assistant_msg = chat_once(base_url, model_name, messages, api_key)
        except Exception as e:
            print(f"\nRequest failed: {e}\n")
            messages.pop()
            continue

        content = assistant_msg.get("content")
        tool_calls = assistant_msg.get("tool_calls")

        if content:
            print(f"\nAssistant:\n{content}\n")
        elif tool_calls:
            print("\nAssistant(tool_calls):")
            print(json.dumps(tool_calls, ensure_ascii=False, indent=2))
            print()
        else:
            print("\nAssistant:\n(empty response)\n")

        normalized_msg = {"role": "assistant", "content": content or ""}
        if tool_calls:
            normalized_msg["tool_calls"] = tool_calls
        messages.append(normalized_msg)


def build_client_command(args):
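    # Rebuild this script's own invocation with --_client so a child process runs the chat UI.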
    cmd = [
        sys.executable,
        os.path.abspath(__file__),
        "--_client",
        "--model",
        args.model,
        "--served-model-name",
        args.served_model_name,
        "--host",
        args.host,
        "--port",
        str(args.port),
        "--max-model-len",
        str(args.max_model_len),
        "--tool-call-parser",
        args.tool_call_parser,
        "--attention-backend",
        args.attention_backend,
        "--enable-auto-tool-choice" if args.enable_auto_tool_choice else "--no-enable-auto-tool-choice",
    ]
    return cmd


def spawn_chat_terminal(args, api_key):
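    # Try to open the chat client in a new terminal window (Windows cmd, gnome-terminal, or x-terminal-emulator).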
    client_cmd = build_client_command(args)
    env = os.environ.copy()
    env["VLLM_API_KEY"] = api_key

    terminal_cmd = None
    if os.name == "nt":
        # Open a new cmd window on Windows and keep it alive for interactive chat.
        terminal_cmd = [
            "cmd",
            "/c",
            "start",
            "",
            "cmd",
            "/k",
            subprocess.list2cmdline(client_cmd),
        ]
    elif shutil.which("gnome-terminal"):
        terminal_cmd = ["gnome-terminal", "--", *client_cmd]
    elif shutil.which("x-terminal-emulator"):
        terminal_cmd = ["x-terminal-emulator", "-e", *client_cmd]

    if not terminal_cmd:
        return False

    try:
        subprocess.Popen(terminal_cmd, env=env)
        return True
    except Exception as e:
        print(f"Failed to open a new terminal automatically: {e}")
        return False


def parse_args():
    parser = argparse.ArgumentParser(description="Minimal local vLLM chat script")
    parser.add_argument("--_client", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--model", default="YCWTG/Qwen3-Coder-30B-A3B-Instruct-int4-mixed-AutoRound")
    parser.add_argument(
        "--served-model-name",
        default="YCWTG/Qwen3-Coder-30B-A3B-Instruct-int4-mixed-AutoRound",
    )
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--max-model-len", type=int, default=106666)
    parser.add_argument(
        "--enable-auto-tool-choice",
        action=argparse.BooleanOptionalAction,
        default=True,
    )
    parser.add_argument("--tool-call-parser", default="qwen3_coder")
    parser.add_argument("--attention-backend", default="FLASHINFER")
    return parser.parse_args()


def main():
    args = parse_args()
    api_key = os.environ.get("VLLM_API_KEY") or "local-dev-key"
    base_url = f"http://{resolve_client_host(args.host)}:{args.port}"
    if args._client:
        chat_loop(base_url, args.served_model_name, api_key)
        return

    proc = launch_vllm(args, api_key)
    atexit.register(stop_vllm, proc)

    print(f"Waiting for service to become ready: {base_url}")
    if not wait_vllm_ready(base_url, api_key):
        print("vLLM startup timed out. Check server logs above.")
        stop_vllm(proc)
        sys.exit(1)

    if spawn_chat_terminal(args, api_key):
        print("Model is ready. Opened a new terminal for chat; this terminal keeps server logs.")
        print("Press Ctrl+C here to stop vLLM.")
        try:
            proc.wait()
        except KeyboardInterrupt:
            print("\nInterrupted. Stopping vLLM...")
    else:
        print("No supported terminal found. Falling back to chat in this terminal.")
        chat_loop(base_url, args.served_model_name, api_key)


if __name__ == "__main__":
    main()
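
Once the server reports ready, any OpenAI-compatible client can talk to it; the interactive loop above is not required. A minimal sketch using the official openai package, assuming the script's default host/port and its fallback "local-dev-key" API key:

# Minimal client sketch for the server launched above (assumed defaults).
from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="local-dev-key")
resp = client.chat.completions.create(
    model="YCWTG/Qwen3-Coder-30B-A3B-Instruct-int4-mixed-AutoRound",
    messages=[{"role": "user", "content": "Write a Python function that reverses a string."}],
)
print(resp.choices[0].message.content)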

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.