Qwen3-Coder-30B-A3B-Instruct-int4-mixed-AutoRound
225
license:apache-2.0
by
YCWTG
Language Model
OTHER
30B params
New
225 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
68GB+ RAM
Mobile
Laptop
Server
Quick Summary
Qwen3-Coder 30B-parameter code-generation language model, quantized to mixed int4 with AutoRound.
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
28GB+ RAM
Code Examples
Quickstart (Python, vLLM)
import argparse
import atexit
import json
import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request
def multiline_input():
    """Read a multi-line message from stdin.

    The user types lines until a line containing only "END" (send the
    message) or "exit"/"quit" (stop chatting).

    Returns:
        The joined message text, or None when the user wants to quit
        (including EOF / Ctrl-D on a closed stdin).
    """
    print('User (type "END" on a single line to send, "exit" to quit):')
    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            # Treat a closed stdin the same as an explicit "exit" so the
            # chat loop shuts down cleanly instead of crashing.
            return None
        text = line.strip()
        if text.lower() in {"exit", "quit"}:
            return None
        if text == "END":
            break
        lines.append(line)
    return "\n".join(lines)
def resolve_client_host(host):
    """Map a wildcard bind address to a loopback address a client can dial."""
    wildcard_addresses = ("0.0.0.0", "::")
    if host in wildcard_addresses:
        return "127.0.0.1"
    return host
def launch_vllm(args, api_key):
    """Start the vLLM OpenAI-compatible server as a subprocess.

    Args:
        args: Parsed CLI namespace carrying the serve options.
        api_key: API key the server will require from clients.

    Returns:
        The subprocess.Popen handle for the server process.

    Raises:
        RuntimeError: When the ``vllm`` executable is not on PATH.
    """
    cmd = ["vllm", "serve", args.model]
    cmd += ["--served-model-name", args.served_model_name]
    cmd += ["--host", args.host]
    cmd += ["--port", str(args.port)]
    cmd += ["--max-model-len", str(args.max_model_len)]
    cmd += ["--tool-call-parser", args.tool_call_parser]
    cmd += ["--attention-backend", args.attention_backend]
    cmd += ["--api-key", api_key]
    if args.enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")
    print("Launching vLLM:")
    print(" ".join(cmd))
    try:
        return subprocess.Popen(cmd)
    except FileNotFoundError as e:
        raise RuntimeError("vllm command not found. Activate an environment that has vllm installed.") from e
def stop_vllm(proc):
    """Terminate the vLLM server process, escalating to kill if needed.

    Safe to call with None or with a process that has already exited.

    Args:
        proc: subprocess.Popen handle (or None).
    """
    if proc is None or proc.poll() is not None:
        return
    proc.terminate()
    try:
        proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        proc.kill()
        # Reap the killed process so it does not linger as a zombie.
        proc.wait()
def wait_vllm_ready(base_url, api_key, timeout_sec=180):
    """Poll the server's /v1/models endpoint until it answers HTTP 200.

    Args:
        base_url: Server root URL, e.g. ``http://127.0.0.1:8000``.
        api_key: Bearer token used to authenticate the probe request.
        timeout_sec: Maximum number of seconds to keep polling.

    Returns:
        True once the endpoint responds with 200, False on timeout.
    """
    probe = urllib.request.Request(
        url=f"{base_url}/v1/models",
        headers={"Authorization": f"Bearer {api_key}"},
    )
    deadline = time.time() + timeout_sec
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(probe, timeout=3) as response:
                if response.status == 200:
                    return True
        except urllib.error.URLError:
            # Server not up yet (connection refused etc.) — keep polling.
            pass
        time.sleep(1)
    return False
def chat_once(base_url, model_name, messages, api_key):
    """Send one chat-completions request and return the assistant message.

    Args:
        base_url: Server root URL.
        model_name: Served model name to request.
        messages: Conversation history in OpenAI chat format.
        api_key: Bearer token for the Authorization header.

    Returns:
        The ``message`` dict of the first choice in the response.
    """
    body = json.dumps(
        {"model": model_name, "messages": messages},
        ensure_ascii=False,
    ).encode("utf-8")
    request = urllib.request.Request(
        url=f"{base_url}/v1/chat/completions",
        data=body,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    # Long timeout: generation on a 30B model can take minutes.
    with urllib.request.urlopen(request, timeout=600) as response:
        reply = json.loads(response.read().decode("utf-8"))
    return reply["choices"][0]["message"]
def chat_loop(base_url, model_name, api_key):
    """Run an interactive chat REPL against the chat-completions endpoint.

    Keeps the full conversation history in memory and replays it with every
    request. A failed request rolls back the just-entered user turn so the
    history stays consistent.
    """
    print("\n===== Chat Started =====\n")
    history = []
    while True:
        user_text = multiline_input()
        if user_text is None:
            break
        history.append({"role": "user", "content": user_text})
        try:
            reply = chat_once(base_url, model_name, history, api_key)
        except Exception as e:
            print(f"\nRequest failed: {e}\n")
            # Drop the user turn so a retry does not duplicate it.
            history.pop()
            continue
        content = reply.get("content")
        tool_calls = reply.get("tool_calls")
        if content:
            print(f"\nAssistant:\n{content}\n")
        elif tool_calls:
            print("\nAssistant(tool_calls):")
            print(json.dumps(tool_calls, ensure_ascii=False, indent=2))
            print()
        else:
            print("\nAssistant:\n(empty response)\n")
        # Store a normalized copy: content must always be a string.
        stored = {"role": "assistant", "content": content or ""}
        if tool_calls:
            stored["tool_calls"] = tool_calls
        history.append(stored)
def build_client_command(args):
    """Build the argv that re-runs this script in client (chat) mode.

    All serve options are forwarded so the client computes the same base
    URL as the server process.
    """
    tool_choice_flag = (
        "--enable-auto-tool-choice"
        if args.enable_auto_tool_choice
        else "--no-enable-auto-tool-choice"
    )
    return [
        sys.executable,
        os.path.abspath(__file__),
        "--_client",
        "--model", args.model,
        "--served-model-name", args.served_model_name,
        "--host", args.host,
        "--port", str(args.port),
        "--max-model-len", str(args.max_model_len),
        "--tool-call-parser", args.tool_call_parser,
        "--attention-backend", args.attention_backend,
        tool_choice_flag,
    ]
def spawn_chat_terminal(args, api_key):
    """Try to open the chat client in a new terminal window.

    The API key is handed to the child through the VLLM_API_KEY environment
    variable rather than on the command line.

    Returns:
        True when a terminal was spawned; False when no supported terminal
        exists or spawning failed (caller falls back to in-terminal chat).
    """
    client_cmd = build_client_command(args)
    child_env = os.environ.copy()
    child_env["VLLM_API_KEY"] = api_key

    if os.name == "nt":
        # "start" opens a fresh cmd window on Windows; "/k" keeps it alive
        # for the interactive chat session.
        terminal_cmd = [
            "cmd", "/c", "start", "",
            "cmd", "/k", subprocess.list2cmdline(client_cmd),
        ]
    elif shutil.which("gnome-terminal"):
        terminal_cmd = ["gnome-terminal", "--", *client_cmd]
    elif shutil.which("x-terminal-emulator"):
        terminal_cmd = ["x-terminal-emulator", "-e", *client_cmd]
    else:
        terminal_cmd = None

    if not terminal_cmd:
        return False
    try:
        subprocess.Popen(terminal_cmd, env=child_env)
    except Exception as e:
        print(f"Failed to open a new terminal automatically: {e}")
        return False
    return True
def parse_args():
    """Define and parse the CLI shared by server and client modes."""
    parser = argparse.ArgumentParser(description="Minimal local vLLM chat script")
    # Internal flag: set when the script re-invokes itself as the chat client.
    parser.add_argument("--_client", action="store_true", help=argparse.SUPPRESS)
    default_model = "YCWTG/Qwen3-Coder-30B-A3B-Instruct-int4-mixed-AutoRound"
    parser.add_argument("--model", default=default_model)
    parser.add_argument("--served-model-name", default=default_model)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--max-model-len", type=int, default=106666)
    # BooleanOptionalAction also generates --no-enable-auto-tool-choice.
    parser.add_argument(
        "--enable-auto-tool-choice",
        action=argparse.BooleanOptionalAction,
        default=True,
    )
    parser.add_argument("--tool-call-parser", default="qwen3_coder")
    parser.add_argument("--attention-backend", default="FLASHINFER")
    return parser.parse_args()
def main():
    """Entry point: serve + spawn chat by default, chat-only with --_client."""
    args = parse_args()
    api_key = os.environ.get("VLLM_API_KEY") or "local-dev-key"
    base_url = f"http://{resolve_client_host(args.host)}:{args.port}"

    # Client mode: this process was re-spawned into a fresh terminal to chat.
    if args._client:
        chat_loop(base_url, args.served_model_name, api_key)
        return

    proc = launch_vllm(args, api_key)
    # Make sure the server is stopped however this process exits.
    atexit.register(stop_vllm, proc)
    print(f"Waiting for service to become ready: {base_url}")
    if not wait_vllm_ready(base_url, api_key):
        print("vLLM startup timed out. Check server logs above.")
        stop_vllm(proc)
        sys.exit(1)

    if spawn_chat_terminal(args, api_key):
        print("Model is ready. Opened a new terminal for chat; this terminal keeps server logs.")
        print("Press Ctrl+C here to stop vLLM.")
        try:
            proc.wait()
        except KeyboardInterrupt:
            print("\nInterrupted. Stopping vLLM...")
    else:
        print("No supported terminal found. Falling back to chat in this terminal.")
        chat_loop(base_url, args.served_model_name, api_key)
if __name__ == "__main__":
main()Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.