Qwen3.5-35B-A3B-int4-mixed-AutoRound
by YCWTG · Language Model · Other · 35B params · License: apache-2.0
New · 12 downloads
Early-stage · Edge AI: Mobile / Laptop / Server · 79GB+ RAM
Quick Summary
An int4 mixed-precision AutoRound quantization of Qwen3.5-35B-A3B (35B total parameters), intended for local serving through vLLM's OpenAI-compatible API (see the quickstart below).
Device Compatibility
Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum Recommended: 33GB+ RAM
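As a rough sanity check on the recommended minimum, the packed weight size of a 35B-parameter model at 4 bits per weight can be estimated directly. The sketch below uses assumed round numbers rather than measurements; real usage is higher once the KV cache, runtime buffers, and any layers the mixed-precision scheme keeps at higher precision are added.

# Back-of-the-envelope memory estimate (assumed round numbers, not measurements).
params = 35e9               # parameter count taken from the model name
bits_per_weight = 4         # int4 packed weights
weight_gb = params * bits_per_weight / 8 / 1e9
print(f"Packed weights alone: ~{weight_gb:.1f} GB")   # ~17.5 GB
# KV cache, activation buffers, and higher-precision layers in the "mixed"
# scheme push total memory well above this, consistent with the 33GB+ figure.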
Code Examples
Quickstart (Python, vLLM)
The script below launches a local vLLM server for this model, waits until the OpenAI-compatible endpoint is healthy, and then opens an interactive chat client (in a new terminal where available).
import argparse
import atexit
import json
import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request
# ---------------------------
# User-facing configuration
# ---------------------------
DEFAULTS = {
    "model": "YCWTG/Qwen3.5-35B-A3B-int4-mixed-AutoRound",
    "served_model_name": "YCWTG/Qwen3.5-35B-A3B-int4-mixed-AutoRound",
    "host": "0.0.0.0",
    "port": 8000,
    "max_model_len": 262144,
    "enable_auto_tool_choice": True,
    "tool_call_parser": "qwen3_coder",
    "reasoning_parser": "qwen3",
    "attention_backend": "FLASHINFER",
    "language_model_only": False,
}
RUNTIME = {
    "gpu_memory_utilization": 0.85,
    "startup_timeout_sec": 180,
    "healthcheck_timeout_sec": 3,
    "healthcheck_interval_sec": 1,
    "chat_timeout_sec": 600,
}
SERVE_VALUE_OPTIONS = (
    ("--served-model-name", "served_model_name"),
    ("--host", "host"),
    ("--port", "port"),
    ("--max-model-len", "max_model_len"),
    ("--tool-call-parser", "tool_call_parser"),
    ("--reasoning-parser", "reasoning_parser"),
    ("--attention-backend", "attention_backend"),
)
CLIENT_VALUE_OPTIONS = (
    ("--model", "model"),
    *SERVE_VALUE_OPTIONS,
)
SERVE_BOOL_OPTIONS = (
    ("--enable-auto-tool-choice", "enable_auto_tool_choice"),
    ("--language-model-only", "language_model_only"),
)
CLIENT_BOOL_OPTIONS = (
    ("--language-model-only", "--no-language-model-only", "language_model_only"),
    ("--enable-auto-tool-choice", "--no-enable-auto-tool-choice", "enable_auto_tool_choice"),
)
def append_value_options(cmd, args, options):
    for flag, attr in options:
        cmd.extend([flag, str(getattr(args, attr))])

def append_true_bool_options(cmd, args, options):
    for flag, attr in options:
        if getattr(args, attr):
            cmd.append(flag)

def append_boolean_optional_options(cmd, args, options):
    for positive_flag, negative_flag, attr in options:
        cmd.append(positive_flag if getattr(args, attr) else negative_flag)

def multiline_input():
    print('User (type "END" on a single line to send, "exit" to quit):')
    lines = []
    while True:
        line = input()
        text = line.strip()
        if text.lower() in {"exit", "quit"}:
            return None
        if text == "END":
            break
        lines.append(line)
    return "\n".join(lines)

def resolve_client_host(host):
    # A wildcard bind address is not connectable from a client; use loopback instead.
    return "127.0.0.1" if host in {"0.0.0.0", "::"} else host

def launch_vllm(args):
    # Build the "vllm serve" command line from the parsed arguments.
    cmd = ["vllm", "serve", args.model]
    append_value_options(cmd, args, SERVE_VALUE_OPTIONS)
    cmd.extend(
        [
            "--enforce-eager",
            "--gpu-memory-utilization",
            str(RUNTIME["gpu_memory_utilization"]),
        ]
    )
    append_true_bool_options(cmd, args, SERVE_BOOL_OPTIONS)
    print("Launching vLLM:")
    print(" ".join(cmd))
    try:
        return subprocess.Popen(cmd)
    except FileNotFoundError as e:
        raise RuntimeError("vllm command not found. Activate an environment that has vllm installed.") from e

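# With the DEFAULTS above, launch_vllm builds roughly this command line:
#   vllm serve YCWTG/Qwen3.5-35B-A3B-int4-mixed-AutoRound \
#     --served-model-name YCWTG/Qwen3.5-35B-A3B-int4-mixed-AutoRound \
#     --host 0.0.0.0 --port 8000 --max-model-len 262144 \
#     --tool-call-parser qwen3_coder --reasoning-parser qwen3 \
#     --attention-backend FLASHINFER \
#     --enforce-eager --gpu-memory-utilization 0.85 \
#     --enable-auto-tool-choice
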
def stop_vllm(proc):
    if proc and proc.poll() is None:
        proc.terminate()
        try:
            proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            proc.kill()

def wait_vllm_ready(base_url, timeout_sec=RUNTIME["startup_timeout_sec"]):
    # Poll the OpenAI-compatible /v1/models endpoint until the server responds.
    deadline = time.time() + timeout_sec
    url = f"{base_url}/v1/models"
    req = urllib.request.Request(url=url)
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(req, timeout=RUNTIME["healthcheck_timeout_sec"]) as resp:
                if resp.status == 200:
                    return True
        except urllib.error.URLError:
            pass
        time.sleep(RUNTIME["healthcheck_interval_sec"])
    return False

def chat_once(base_url, model_name, messages):
    # Send one non-streaming request to the OpenAI-compatible chat endpoint.
    payload = {"model": model_name, "messages": messages}
    req = urllib.request.Request(
        url=f"{base_url}/v1/chat/completions",
        data=json.dumps(payload, ensure_ascii=False).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=RUNTIME["chat_timeout_sec"]) as resp:
        data = json.loads(resp.read().decode("utf-8"))
    return data["choices"][0]["message"]

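# chat_once sends the bare OpenAI chat-completions schema; optional sampling
# fields could be added to the payload in the same way, for example
# (illustrative values, not recommendations):
#   payload = {"model": model_name, "messages": messages,
#              "temperature": 0.7, "max_tokens": 1024}
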
def chat_loop(base_url, model_name):
    print("\n===== Chat Started =====\n")
    messages = []
    while True:
        user_text = multiline_input()
        if user_text is None:
            break
        messages.append({"role": "user", "content": user_text})
        try:
            assistant_msg = chat_once(base_url, model_name, messages)
        except Exception as e:
            # Drop the failed user turn so the history stays consistent.
            print(f"\nRequest failed: {e}\n")
            messages.pop()
            continue
        content = assistant_msg.get("content")
        tool_calls = assistant_msg.get("tool_calls")
        if content:
            print(f"\nAssistant:\n{content}\n")
        elif tool_calls:
            print("\nAssistant(tool_calls):")
            print(json.dumps(tool_calls, ensure_ascii=False, indent=2))
            print()
        else:
            print("\nAssistant:\n(empty response)\n")
        # Keep the assistant turn (including any tool calls) in the running history.
        normalized_msg = {"role": "assistant", "content": content or ""}
        if tool_calls:
            normalized_msg["tool_calls"] = tool_calls
        messages.append(normalized_msg)

def build_client_command(args):
    # Re-invoke this script with the hidden --_client flag so the new process
    # only runs the chat loop against the already-running server.
    cmd = [sys.executable, os.path.abspath(__file__), "--_client"]
    append_value_options(cmd, args, CLIENT_VALUE_OPTIONS)
    append_boolean_optional_options(cmd, args, CLIENT_BOOL_OPTIONS)
    return cmd

def spawn_chat_terminal(args):
    client_cmd = build_client_command(args)
    terminal_cmd = None
    if os.name == "nt":
        # Open a new cmd window on Windows and keep it alive for interactive chat.
        terminal_cmd = [
            "cmd",
            "/c",
            "start",
            "",
            "cmd",
            "/k",
            subprocess.list2cmdline(client_cmd),
        ]
    elif shutil.which("gnome-terminal"):
        terminal_cmd = ["gnome-terminal", "--", *client_cmd]
    elif shutil.which("x-terminal-emulator"):
        terminal_cmd = ["x-terminal-emulator", "-e", *client_cmd]
    if not terminal_cmd:
        return False
    try:
        subprocess.Popen(terminal_cmd)
        return True
    except Exception as e:
        print(f"Failed to open a new terminal automatically: {e}")
        return False

def parse_args():
    parser = argparse.ArgumentParser(description="Minimal local vLLM chat script")
    parser.add_argument("--_client", action="store_true", help=argparse.SUPPRESS)
    parser.add_argument("--model", default=DEFAULTS["model"])
    parser.add_argument(
        "--served-model-name",
        default=DEFAULTS["served_model_name"],
    )
    parser.add_argument("--host", default=DEFAULTS["host"])
    parser.add_argument("--port", type=int, default=DEFAULTS["port"])
    parser.add_argument("--max-model-len", type=int, default=DEFAULTS["max_model_len"])
    parser.add_argument(
        "--enable-auto-tool-choice",
        action=argparse.BooleanOptionalAction,
        default=DEFAULTS["enable_auto_tool_choice"],
    )
    parser.add_argument("--tool-call-parser", default=DEFAULTS["tool_call_parser"])
    parser.add_argument("--reasoning-parser", default=DEFAULTS["reasoning_parser"])
    parser.add_argument("--attention-backend", default=DEFAULTS["attention_backend"])
    parser.add_argument(
        "--language-model-only",
        action=argparse.BooleanOptionalAction,
        default=DEFAULTS["language_model_only"],
    )
    return parser.parse_args()

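# Example invocations ("serve_and_chat.py" is a placeholder for wherever this
# script is saved; every flag falls back to the DEFAULTS dict above):
#   python serve_and_chat.py
#   python serve_and_chat.py --port 8001 --max-model-len 32768
#   python serve_and_chat.py --no-enable-auto-tool-choice
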
def main():
    args = parse_args()
    base_url = f"http://{resolve_client_host(args.host)}:{args.port}"
    if args._client:
        # Child process spawned by spawn_chat_terminal: only run the chat client.
        chat_loop(base_url, args.served_model_name)
        return
    # Parent process: start the vLLM server and make sure it is stopped on exit.
    proc = launch_vllm(args)
    atexit.register(stop_vllm, proc)
    print(f"Waiting for service to become ready: {base_url}")
    if not wait_vllm_ready(base_url):
        print("vLLM startup timed out. Check server logs above.")
        stop_vllm(proc)
        sys.exit(1)
    if spawn_chat_terminal(args):
        print("Model is ready. Opened a new terminal for chat; this terminal keeps server logs.")
        print("Press Ctrl+C here to stop vLLM.")
        try:
            proc.wait()
        except KeyboardInterrupt:
            print("\nInterrupted. Stopping vLLM...")
    else:
        print("No supported terminal found. Falling back to chat in this terminal.")
        chat_loop(base_url, args.served_model_name)

if __name__ == "__main__":
    main()
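Once the server from the quickstart is running, any OpenAI-compatible client can talk to it directly. The sketch below uses the openai Python SDK as an assumed alternative client; it expects the server on the default host and port, and the api_key value is a placeholder because the quickstart does not configure authentication.

from openai import OpenAI

# Talk to the local vLLM server started by the quickstart script.
client = OpenAI(base_url="http://127.0.0.1:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="YCWTG/Qwen3.5-35B-A3B-int4-mixed-AutoRound",
    messages=[{"role": "user", "content": "Give me a two-sentence summary of int4 quantization."}],
)
print(response.choices[0].message.content)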
Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.