Qwen3-4B-int8-int4-unsloth-v3

by metascroy · Apache-2.0 license
Language model · 4B params · 58 downloads · Early-stage
Edge AI targets: Mobile, Laptop, Server
Quick Summary

Qwen3-4B fine-tuned with Unsloth under quantization-aware training (QAT) using the mobile-CPU-friendly int8-int4 scheme: embeddings are quantized to 8 bits and linear layers to 4 bits with 8-bit dynamically quantized activations, targeting ExecuTorch CPU deployment on edge devices.
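
To try the published checkpoint directly, here is a minimal sketch. It assumes a recent transformers with torchao installed (the checkpoint is saved in torchao format by the script below), and the repo id is inferred from this page's title and author:

from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "metascroy/Qwen3-4B-int8-int4-unsloth-v3"  # inferred from this page
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map = "auto")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Solve (x + 2)^2 = 0."}],
    tokenize = False,
    add_generation_prompt = True,
)
inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)
output = model.generate(**inputs, max_new_tokens = 256)
print(tokenizer.decode(output[0], skip_special_tokens = True))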

Device Compatibility

Mobile: 4-6GB RAM
Laptop: 16GB RAM
Server: GPU
Minimum recommended: 4GB+ RAM
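
As a rough sanity check on the mobile figure: about 4B linear-layer weights at 4 bits come to roughly 4e9 × 0.5 bytes ≈ 2 GB, before 8-bit embeddings, the KV cache, and runtime buffers, which is consistent with the 4-6GB mobile range above. This is a back-of-envelope estimate, not a measured number.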

Code Examples

Finetune with unsloth and torchao

The script below walks through the full workflow: load Qwen3-4B for quantization-aware training with the int8-int4 scheme, mix reasoning and conversational data, run supervised fine-tuning, convert the trained model to torchao format, and save it locally or push it to the Hub.
################################################################################
# We first load the model for QAT using the mobile CPU friendly int8-int4 scheme
################################################################################

from unsloth import FastLanguageModel
from unsloth.chat_templates import (
    get_chat_template,
)
import torch

MODEL_ID = "unsloth/Qwen3-4B"
QAT_SCHEME = "int8-int4"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_ID,
    max_seq_length = 2048,
    dtype = torch.bfloat16,
    load_in_4bit = False,
    full_finetuning = True,
    # ExecuTorch CPU quantization scheme
    # Quantize embedding to 8-bits, and quantize linear layers to 4-bits
    # with 8-bit dynamically quantized activations
    qat_scheme = QAT_SCHEME,
)
tokenizer = get_chat_template(tokenizer, chat_template = "qwen3")
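# QAT note: quantization-aware training inserts "fake quantization" into the
# forward pass so the weights adapt to the int8/int4 grid during training,
# which typically loses less accuracy than quantizing after the fact.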


################################################################################
# Data prep
################################################################################

from datasets import load_dataset
reasoning_dataset = load_dataset("unsloth/OpenMathReasoning-mini", split = "cot")
non_reasoning_dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

# Convert the dataset into a conversational format
def generate_conversation(examples):
    problems  = examples["problem"]
    solutions = examples["generated_solution"]
    conversations = []
    for problem, solution in zip(problems, solutions):
        conversations.append([
            {"role" : "user",      "content" : problem},
            {"role" : "assistant", "content" : solution},
        ])
    return { "conversations": conversations, }

reasoning_conversations = tokenizer.apply_chat_template(
    list(reasoning_dataset.map(generate_conversation, batched = True)["conversations"]),
    tokenize = False,
)

from unsloth.chat_templates import standardize_sharegpt
dataset = standardize_sharegpt(non_reasoning_dataset)
non_reasoning_conversations = tokenizer.apply_chat_template(
    list(dataset["conversations"]),
    tokenize = False,
)

# Let's create a combined dataset that mixes 25% conversational vs. 75% reasoning
chat_percentage = 0.25
import pandas as pd
non_reasoning_subset = pd.Series(non_reasoning_conversations)
non_reasoning_subset = non_reasoning_subset.sample(
    int(len(reasoning_conversations)*(chat_percentage/(1 - chat_percentage))),
    random_state=2407,
)
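# Why this subset size: sampling R * p/(1-p) chat rows (R = reasoning rows,
# p = 0.25) makes chat exactly p of the combined set, since
# (R*p/(1-p)) / (R*p/(1-p) + R) = p.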
print(len(reasoning_conversations))
print(len(non_reasoning_subset))
print(len(non_reasoning_subset) / (len(non_reasoning_subset) + len(reasoning_conversations)))


data = pd.concat([
    pd.Series(reasoning_conversations),
    pd.Series(non_reasoning_subset)
])
data.name = "text"

from datasets import Dataset
combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed = 3407)


################################################################################
# Define trainer
################################################################################

from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-5,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use TrackIO/WandB etc
    ),
)
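# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps
# = 2 * 4 = 8 sequences per optimizer step.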


################################################################################
# Do fine tuning
################################################################################
print("DOING FINETUNING")
trainer_stats = trainer.train()
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)


################################################################################
# Inference
################################################################################
messages = [
    {"role" : "user", "content" : "Solve (x + 2)^2 = 0."}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = False, # Disable thinking
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = 256, # Increase for longer outputs!
    temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
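# The sampling settings above (temperature=0.7, top_p=0.8, top_k=20) follow
# Qwen3's recommended non-thinking defaults; for thinking mode
# (enable_thinking=True), the Qwen3 model card recommends temperature=0.6
# and top_p=0.95 instead.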


################################################################################
# Convert model to torchao format and save
################################################################################

from unsloth.models._utils import _convert_torchao_model
_convert_torchao_model(model)

model_name = MODEL_ID.split("/")[-1]
save_to = f"{model_name}-{QAT_SCHEME}-unsloth-v3"

# Save locally
# model.save_pretrained(save_to, safe_serialization=False)
# tokenizer.save_pretrained(save_to)

# Or save to hub
from huggingface_hub import get_token, whoami
def _get_username():
    token = get_token()
    username = whoami(token=token)["name"]
    return username
username = _get_username()
model.push_to_hub(f"{username}/{save_to}", safe_serialization=False)
tokenizer.push_to_hub(f"{username}/{save_to}")
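
################################################################################
# Optional: on-device deployment (sketch)
################################################################################
# The int8-int4 scheme targets ExecuTorch CPU backends. One possible export
# path, assuming optimum-executorch is installed and supports this quantized
# checkpoint (check its docs for current flags):
#
#   optimum-cli export executorch \
#       --model <username>/Qwen3-4B-int8-int4-unsloth-v3 \
#       --task text-generation --recipe xnnpack \
#       --output_dir qwen3_executorch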
