Qwen3-1.77B-g023

by g023
License: apache-2.0
Type: Language Model (Other)
Size: 1.77B params
Downloads: 586
Status: New, early-stage
Edge AI targets: Mobile, Laptop, Server (4GB+ RAM)
Quick Summary

A 1.77B-parameter Qwen3-based language model, released under the Apache-2.0 license and aimed at edge deployments on mobile, laptop, and server hardware.

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 2GB+ RAM
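
For devices near the lower end of these requirements, 4-bit quantization is one way to fit the model into memory. The following is a minimal sketch, assuming a CUDA GPU and the bitsandbytes package; the settings are illustrative, not tuned for this model:

# Illustrative 4-bit loading sketch (assumes CUDA and bitsandbytes are available)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(
    "g023/Qwen3-1.77B-g023",
    device_map="auto",
    quantization_config=quant_config,
)
tokenizer = AutoTokenizer.from_pretrained("g023/Qwen3-1.77B-g023")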

Code Examples

Usage (Python, transformers)
# Tweakable parameters
# MODEL_PATH = "./Qwen3-BEST" # local run
MODEL_PATH = "g023/Qwen3-1.77B-g023"
MAX_NEW_TOKENS = 8192
TEMPERATURE = 0.7
DO_SAMPLE = True
TOP_P = 0.9
TOP_K = 50
REPETITION_PENALTY = 1.1
STREAMING = True  # Set to True for streaming inference
INPUT_MESSAGE = "You are completing the next step in a task to create an arcade game in javascript. Your available tools are rationalize, red_green_tdd, and create_plan. Synthesize their output when reasoning. "

from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from io import StringIO
import time

def load_model():
    print("Loading model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_PATH,
        device_map="auto",
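        # torch_dtype="auto",  # optional: keep weights in their checkpoint precision to reduce memory use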
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("Model loaded.")
    return model, tokenizer

def inference_non_streaming(model, tokenizer, messages):
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
    )
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print("Response:", response)
    return response

def inference_streaming(model, tokenizer, messages):
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    # TextStreamer prints tokens to stdout as they are generated
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        streamer=streamer,
    )
    # Decode only the newly generated tokens (excluding the prompt) so the caller gets the full string
    final_response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    return final_response

def llm_stream(model, tokenizer, conversation):
    start_time = time.time()
    text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True, enable_thinking=True)
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    buffer = StringIO()
    # Streamer subclass that prints tokens live while also capturing them for post-processing
    class CapturingTextStreamer(TextStreamer):
        def __init__(self, tokenizer, buffer):
            super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
            self.buffer = buffer
        def on_finalized_text(self, text, stream_end=False):
            self.buffer.write(text)
            print(text, end="", flush=True)
    streamer = CapturingTextStreamer(tokenizer, buffer)
    model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=DO_SAMPLE,
        top_p=TOP_P,
        top_k=TOP_K,
        repetition_penalty=REPETITION_PENALTY,
        streamer=streamer,
    )
    response = buffer.getvalue()

    if "</think>" in response:
        parts = response.rsplit("</think>", 1)
        reasoning = parts[0].strip()
        content = parts[1].strip()
    else:
        reasoning = ""
        content = response.strip()
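    # Rough average of characters per token, used only to approximate the token counts below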
    char_per_token = 3.245
    reasoning_tokens = round(len(reasoning) / char_per_token)
    content_tokens = round(len(content) / char_per_token)
    total_tokens = reasoning_tokens + content_tokens
    time_taken = time.time() - start_time
    ret_dict = {
        "reasoning": reasoning,
        "content": content,
        "usage": {
            "reasoning_tokens": reasoning_tokens,
            "content_tokens": content_tokens,
            "total_tokens": total_tokens,
        },
        "time_taken": time_taken,
    }
    return ret_dict

if __name__ == "__main__":
    model, tokenizer = load_model()
    messages = [{"role": "user", "content": INPUT_MESSAGE}]
    if STREAMING:
        ret = llm_stream(model, tokenizer, messages)
        print("Result dict:", ret)
        # Tokens per second from the estimated token count and elapsed time
        if ret["usage"]["total_tokens"] > 0 and ret["time_taken"] > 0:
            tps = ret["usage"]["total_tokens"] / ret["time_taken"]
            print(f"Tokens per second: {tps:.2f}")
    else:
        inference_non_streaming(model, tokenizer, messages)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API
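
As an illustration only: Together's inference endpoint is OpenAI-compatible, so a request could look like the sketch below. The model identifier is an assumption for this listing; this specific community upload may not be hosted there, so check the provider's catalog for the exact name.

# Hypothetical sketch: querying a hosted copy via Together's OpenAI-compatible endpoint
from openai import OpenAI

client = OpenAI(
    base_url="https://api.together.xyz/v1",
    api_key="YOUR_TOGETHER_API_KEY",
)
response = client.chat.completions.create(
    model="g023/Qwen3-1.77B-g023",  # assumed id; verify against the provider catalog
    messages=[{"role": "user", "content": "Write a haiku about edge AI."}],
    temperature=0.7,
    max_tokens=256,
)
print(response.choices[0].message.content)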

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now
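
Again as an illustration only: once a copy of the model is deployed on Replicate, it can be called with the Replicate Python client. The model reference below is an assumption; use the identifier shown on your own deployment page.

# Hypothetical sketch: calling a deployed copy via the Replicate Python client
import replicate

output = replicate.run(
    "g023/qwen3-1.77b-g023",  # assumed reference; replace with your deployment's identifier
    input={"prompt": "Write a haiku about edge AI."},
)
print("".join(output))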

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.