Qwen3-1.77B-g023
by g023 · license: apache-2.0 · 1.77B params · 586 downloads
Language Model · New · Early-stage
Edge AI: Mobile, Laptop, Server (4GB+ RAM)
Quick Summary
A 1.77B-parameter Qwen3-based causal language model aimed at edge devices (mobile, laptop, and small servers). It uses the standard transformers chat template and can emit Qwen3-style "thinking" traces delimited by </think> before its final answer.
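For a quick smoke test before wiring up the full script below, the checkpoint should load with the stock transformers text-generation pipeline (a minimal sketch, untested against this exact repo; the prompt is illustrative):

from transformers import pipeline

# Assumes the repo is a standard causal-LM checkpoint on the Hugging Face Hub
pipe = pipeline("text-generation", model="g023/Qwen3-1.77B-g023", device_map="auto")
out = pipe("Write a one-line description of edge AI.", max_new_tokens=64)
print(out[0]["generated_text"])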
Device Compatibility
- Mobile: 4-6GB RAM
- Laptop: 16GB RAM
- Server: GPU
Minimum recommended: 2GB+ RAM
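To fit the lower end of that range, the weights can be loaded in half precision, or in 4-bit if the optional bitsandbytes package and a CUDA GPU are available (a hedged sketch; actual memory use varies by device and backend):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Half precision roughly halves the fp32 footprint (~3.5GB of weights for 1.77B params)
model = AutoModelForCausalLM.from_pretrained(
    "g023/Qwen3-1.77B-g023",
    torch_dtype=torch.float16,
    device_map="auto",
)

# Or 4-bit quantization for the tightest budgets (requires bitsandbytes + CUDA)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "g023/Qwen3-1.77B-g023",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)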
Code Examples
Usage (Python, transformers):
# Tweakable parameters
# MODEL_PATH = "./Qwen3-BEST" # local run
MODEL_PATH = "g023/Qwen3-1.77B-g023"
MAX_NEW_TOKENS = 8192
TEMPERATURE = 0.7
DO_SAMPLE = True
TOP_P = 0.9
TOP_K = 50
REPETITION_PENALTY = 1.1
STREAMING = True # Set to True for streaming inference
INPUT_MESSAGE = "You are completing the next step in a task to create an arcade game in javascript. Your available tools are rationalize, red_green_tdd, and create_plan. Synthesize their output when reasoning. "
import time
from io import StringIO

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

def load_model():
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("Model loaded.")
    return model, tokenizer

def inference_non_streaming(model, tokenizer, messages):
    # Build the prompt with Qwen3's chat template; enable_thinking lets the model emit a reasoning trace
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
    )
    # Decode only the newly generated tokens (skip the prompt)
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print("Response:", response)
    return response

def inference_streaming(model, tokenizer, messages):
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # TextStreamer prints tokens to stdout as they are generated
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        streamer=streamer,
    )
    # Decode the completion into a final string as well (skip the prompt tokens)
    final_response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return final_response

def llm_stream(model, tokenizer, conversation):
    start_time = time.time()
    text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    buffer = StringIO()

    class CapturingTextStreamer(TextStreamer):
        # Echoes tokens to stdout while also capturing them in a buffer
        def __init__(self, tokenizer, buffer):
            super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
            self.buffer = buffer

        def on_finalized_text(self, text, stream_end=False):
            self.buffer.write(text)
            print(text, end="", flush=True)

    streamer = CapturingTextStreamer(tokenizer, buffer)
    model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        streamer=streamer,
    )
    response = buffer.getvalue()
    # Split the Qwen3 "thinking" trace from the final answer on the last </think> tag
    if "</think>" in response:
        parts = response.rsplit("</think>", 1)
        reasoning = parts[0].strip()
        content = parts[1].strip()
    else:
        reasoning = ""
        content = response.strip()
    # Rough estimate: ~3.245 characters per token; not the tokenizer's exact count
    char_per_token = 3.245
    reasoning_tokens = round(len(reasoning) / char_per_token)
    content_tokens = round(len(content) / char_per_token)
    total_tokens = reasoning_tokens + content_tokens
    time_taken = time.time() - start_time
    ret_dict = {
        "reasoning": reasoning,
        "content": content,
        "usage": {
            "reasoning_tokens": reasoning_tokens,
            "content_tokens": content_tokens,
            "total_tokens": total_tokens,
        },
        "time_taken": time_taken,
    }
    return ret_dict

if __name__ == "__main__":
    model, tokenizer = load_model()
    messages = [{"role": "user", "content": INPUT_MESSAGE}]
    if STREAMING:
        ret = llm_stream(model, tokenizer, messages)
        print("\nResult dict:", ret)
        # Report output tokens per second from the estimated token count and elapsed time
        if ret["usage"]["total_tokens"] > 0 and ret["time_taken"] > 0:
            tps = ret["usage"]["total_tokens"] / ret["time_taken"]
            print(f"Tokens per second: {tps:.2f}")
    else:
        inference_non_streaming(model, tokenizer, messages)
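The char_per_token constant in llm_stream is only a rough heuristic. If exact numbers matter, the tokenizer itself can count tokens; a minimal sketch (count_tokens is a hypothetical helper, not part of the script above):

def count_tokens(tokenizer, text):
    # Exact token count using the model's own tokenizer (no special tokens added)
    return len(tokenizer.encode(text, add_special_tokens=False))

# Usage: replace the character-based estimates in llm_stream with, e.g.:
# reasoning_tokens = count_tokens(tokenizer, reasoning)
# content_tokens = count_tokens(tokenizer, content)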