Qwen3-Coder-30B-A3B-Instruct-W4A16-mixed-AWQ
by YCWTG · license: apache-2.0 · Language Model (OTHER) · 30B params · 272 downloads · New · Early-stage
Edge AI: Mobile · Laptop · Server · 68GB+ RAM
Quick Summary
A W4A16 mixed-AWQ quantization of Qwen/Qwen3-Coder-30B-A3B-Instruct, a 30B-parameter mixture-of-experts coding model. The attention projections, MoE router gates, and LM head stay at higher precision (the "mixed" in the name), while the remaining linear layers are compressed to 4-bit weights for local serving with vLLM.
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 28GB+ RAM
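As a rough cross-check of the minimum recommendation (a back-of-the-envelope sketch; the 16-bit share is an assumption, since the exact split depends on the ignore list in the quantization recipe further below):

    # Weight-memory estimate for a mixed W4A16 30B model.
    total_params = 30e9
    frac_16bit = 0.15                                # assumed unquantized share
    gb = (total_params * (1 - frac_16bit) * 0.5      # 4-bit weights: 0.5 bytes/param
          + total_params * frac_16bit * 2.0) / 1e9   # 16-bit weights: 2 bytes/param
    print(f"~{gb:.0f} GB for weights alone")         # ~22 GB; KV cache and runtime
                                                     # overhead push toward 28GB+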
Code Examples
Quickstart (Python, vLLM)
import argparse
import atexit
import json
import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request


def multiline_input():
    print('User (type "END" on a single line to send, "exit" to quit):')
    lines = []
    while True:
        line = input()
        text = line.strip()
        if text.lower() in {"exit", "quit"}:
            return None
        if text == "END":
            break
        lines.append(line)
    return "\n".join(lines)


def resolve_client_host(host):
    return "127.0.0.1" if host in {"0.0.0.0", "::"} else host


def launch_vllm(args, api_key):
    cmd = [
        "vllm",
        "serve",
        args.model,
        "--served-model-name",
        args.served_model_name,
        "--host",
        args.host,
        "--port",
        str(args.port),
        "--max-model-len",
        str(args.max_model_len),
        "--tool-call-parser",
        args.tool_call_parser,
        "--attention-backend",
        args.attention_backend,
        "--api-key",
        api_key,
    ]
    if args.enable_auto_tool_choice:
        cmd.append("--enable-auto-tool-choice")
    print("Launching vLLM:")
    print(" ".join(cmd))
    try:
        return subprocess.Popen(cmd)
    except FileNotFoundError as e:
        raise RuntimeError(
            "vllm command not found. Activate an environment that has vllm installed."
        ) from e


def stop_vllm(proc):
    if proc and proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()


def wait_vllm_ready(base_url, api_key, timeout_sec=180):
    deadline = time.time() + timeout_sec
    url = f"{base_url}/v1/models"
    req = urllib.request.Request(url=url, headers={"Authorization": f"Bearer {api_key}"})
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(req, timeout=3) as resp:
                if resp.status == 200:
                    return True
        except urllib.error.URLError:
            pass
        time.sleep(1)
    return False


def chat_once(base_url, model_name, messages, api_key):
    payload = {"model": model_name, "messages": messages}
    req = urllib.request.Request(
        url=f"{base_url}/v1/chat/completions",
        data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=600) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    return data["choices"][0]["message"]


def chat_loop(base_url, model_name, api_key):
    print("\n===== Chat Started =====\n")
    messages = []
    while True:
        user_text = multiline_input()
        if user_text is None:
            break
        messages.append({"role": "user", "content": user_text})
        try:
            assistant_msg = chat_once(base_url, model_name, messages, api_key)
        except Exception as e:
            print(f"\nRequest failed: {e}\n")
            messages.pop()
            continue
        content = assistant_msg.get("content")
        tool_calls = assistant_msg.get("tool_calls")
        if content:
            print(f"\nAssistant:\n{content}\n")
        elif tool_calls:
            print("\nAssistant(tool_calls):")
            print(json.dumps(tool_calls, ensure_ascii=False, indent=2))
            print()
        else:
            print("\nAssistant:\n(empty response)\n")
        normalized_msg = {"role": "assistant", "content": content or ""}
        if tool_calls:
            normalized_msg["tool_calls"] = tool_calls
        messages.append(normalized_msg)


def build_client_command(args):
    cmd = [
        sys.executable,
        os.path.abspath(__file__),
        "--_client",
        "--model",
        args.model,
        "--served-model-name",
        args.served_model_name,
        "--host",
        args.host,
        "--port",
        str(args.port),
        "--max-model-len",
        str(args.max_model_len),
        "--tool-call-parser",
        args.tool_call_parser,
        "--attention-backend",
        args.attention_backend,
        "--enable-auto-tool-choice" if args.enable_auto_tool_choice else "--no-enable-auto-tool-choice",
    ]
    return cmd


def spawn_chat_terminal(args, api_key):
    client_cmd = build_client_command(args)
    env = os.environ.copy()
    env["VLLM_API_KEY"] = api_key
    terminal_cmd = None
    if os.name == "nt":
        # Open a new cmd window on Windows and keep it alive for interactive chat.
        terminal_cmd = [
            "cmd",
            "/c",
            "start",
            "",
            "cmd",
            "/k",
            subprocess.list2cmdline(client_cmd),
        ]
    elif shutil.which("gnome-terminal"):
        terminal_cmd = ["gnome-terminal", "--", *client_cmd]
    elif shutil.which("x-terminal-emulator"):
        terminal_cmd = ["x-terminal-emulator", "-e", *client_cmd]
    if not terminal_cmd:
        return False
    try:
        subprocess.Popen(terminal_cmd, env=env)
        return True
    except Exception as e:
        print(f"Failed to open a new terminal automatically: {e}")
        return False


def parse_args():
    parser = argparse.ArgumentParser(description="Minimal local vLLM chat script")
    parser.add_argument("--_client", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--model", default="YCWTG/Qwen3-Coder-30B-A3B-Instruct-W4A16-mixed-AWQ")
    parser.add_argument(
        "--served-model-name",
        default="YCWTG/Qwen3-Coder-30B-A3B-Instruct-W4A16-mixed-AWQ",
    )
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--max-model-len", type=int, default=108888)
    parser.add_argument(
        "--enable-auto-tool-choice",
        action=argparse.BooleanOptionalAction,
        default=True,
    )
    parser.add_argument("--tool-call-parser", default="qwen3_coder")
    parser.add_argument("--attention-backend", default="FLASHINFER")
    return parser.parse_args()


def main():
    args = parse_args()
    api_key = os.environ.get("VLLM_API_KEY") or "local-dev-key"
    base_url = f"http://{resolve_client_host(args.host)}:{args.port}"
    if args._client:
        chat_loop(base_url, args.served_model_name, api_key)
        return
    proc = launch_vllm(args, api_key)
    atexit.register(stop_vllm, proc)
    print(f"Waiting for service to become ready: {base_url}")
    if not wait_vllm_ready(base_url, api_key):
        print("vLLM startup timed out. Check server logs above.")
        stop_vllm(proc)
        sys.exit(1)
    if spawn_chat_terminal(args, api_key):
        print("Model is ready. Opened a new terminal for chat; this terminal keeps server logs.")
        print("Press Ctrl+C here to stop vLLM.")
        try:
            proc.wait()
        except KeyboardInterrupt:
            print("\nInterrupted. Stopping vLLM...")
    else:
        print("No supported terminal found. Falling back to chat in this terminal.")
        chat_loop(base_url, args.served_model_name, api_key)


if __name__ == "__main__":
    main()
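If the script above is saved as, say, chat_vllm.py (the filename is arbitrary), a typical run looks like this; every flag mirrors an argparse default, so all of them are optional:

    # Start the server, wait for /v1/models to answer, then open a chat window.
    VLLM_API_KEY=local-dev-key python chat_vllm.py --port 8000 --max-model-len 32768

Lowering --max-model-len from the 108888 default is the easiest way to shrink the KV cache if the server fails to start on a smaller GPU.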
Serve with vLLM (bash)

vllm serve YCWTG/Qwen3-Coder-30B-A3B-Instruct-W4A16-mixed-AWQ --host localhost --port 8000 --max-model-len 108888 --enable-auto-tool-choice --tool-call-parser qwen3_coder
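Once the server is up, a quick smoke test against vLLM's OpenAI-compatible endpoint (omit the Authorization header if the server was started without --api-key; "local-dev-key" is only the Python script's fallback value):

    curl http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -H "Authorization: Bearer local-dev-key" \
      -d '{"model": "YCWTG/Qwen3-Coder-30B-A3B-Instruct-W4A16-mixed-AWQ",
           "messages": [{"role": "user", "content": "Write a one-line Python palindrome check."}]}'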
Generate the Model (Python, transformers + llm-compressor)

from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier

MODEL_ID = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
SAVE_DIR = MODEL_ID.split("/")[-1] + "-W4A16-awq"

# Configure the quantization algorithm to run. Attention projections and the
# MoE router gates are left unquantized, which is what makes the scheme "mixed".
recipe = [
    AWQModifier(
        duo_scaling=False,
        ignore=[
            "lm_head",
            "re:.*self_attn\\.(q_proj|k_proj|v_proj|o_proj)$",
            "re:.*mlp\\.gate$",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]

# Select calibration dataset.
DATASET_ID = "code-search-net/code_search_net"
DATASET_SPLIT = "train"

# Select number of samples. 258 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 258
MAX_SEQUENCE_LENGTH = 2048


def get_calib_dataset(tokenizer):
    from datasets import load_dataset

    # code_search_net ships per-language configs; "python" is assumed here.
    ds = load_dataset(
        DATASET_ID,
        "python",
        split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES*10}]",
    )

    def preprocess(example):
        # code_search_net has no "instruction"/"output" columns; use the
        # docstring as the prompt and the function body as the completion.
        chat_messages = [
            {"role": "user", "content": example["func_documentation_string"].strip()},
            {"role": "assistant", "content": example["func_code_string"].strip()},
        ]
        tokenized_messages = tokenizer.apply_chat_template(
            chat_messages, tokenize=True
        )
        return {"input_ids": tokenized_messages}

    ds = (
        ds.shuffle(seed=42)
        .map(preprocess, remove_columns=ds.column_names)
        .select(range(NUM_CALIBRATION_SAMPLES))
    )
    return ds


if __name__ == "__main__":
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    ###
    ### Apply algorithms.
    ###
    oneshot(
        model=model,
        dataset=get_calib_dataset(tokenizer),
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        log_dir=None,
        trust_remote_code_model=True,
    )
    model.save_pretrained(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)
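After the save completes, a minimal sanity check of the quantized checkpoint with vLLM's offline API might look like the sketch below (max_model_len and the prompt are arbitrary choices, not part of the original recipe):

    from vllm import LLM, SamplingParams

    # SAVE_DIR from the script above resolves to this directory name.
    llm = LLM(model="Qwen3-Coder-30B-A3B-Instruct-W4A16-awq", max_model_len=4096)
    params = SamplingParams(temperature=0.2, max_tokens=128)
    out = llm.generate(["# Write a function that checks whether a number is prime\n"], params)
    print(out[0].outputs[0].text)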