Qwen3-4B-int8-int4-unsloth-v3
58
license:apache-2.0
by
metascroy
Language Model
OTHER
4B params
New
58 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
9GB+ RAM
Mobile
Laptop
Server
Quick Summary
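Qwen3-4B fine-tuned with quantization-aware training (QAT) using the int8-int4 scheme: embeddings quantized to 8 bits and linear layers to 4 bits, with 8-bit dynamically quantized activations, targeting ExecuTorch on mobile CPUs.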
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
4GB+ RAM
Code Examples
Finetune with unsloth and torchao
python
transformers
################################################################################
# Load the base model with QAT enabled, using the mobile CPU friendly
# int8-int4 scheme
################################################################################
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

MODEL_ID = "unsloth/Qwen3-4B"
QAT_SCHEME = "int8-int4"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,
    max_seq_length=2048,
    dtype=torch.bfloat16,
    load_in_4bit=False,
    full_finetuning=True,
    # ExecuTorch CPU quantization scheme: embeddings are quantized to 8 bits
    # and linear layers to 4 bits, with 8-bit dynamically quantized
    # activations.
    qat_scheme=QAT_SCHEME,
)

# Replace the tokenizer with one carrying the "qwen3" chat template.
tokenizer = get_chat_template(tokenizer, chat_template="qwen3")
################################################################################
# Data prep: fetch the two source datasets
################################################################################
from datasets import load_dataset

# Reasoning examples come from the "cot" split.
reasoning_dataset = load_dataset(
    "unsloth/OpenMathReasoning-mini",
    split="cot",
)

# Non-reasoning (chat) examples come from the "train" split.
non_reasoning_dataset = load_dataset(
    "mlabonne/FineTome-100k",
    split="train",
)
# Convert the dataset into a conversational format
def generate_conversation(examples):
    """Turn a batch of problem/solution pairs into two-turn conversations.

    Written for ``Dataset.map(..., batched=True)``, where ``examples`` maps
    column names to lists of values.

    Args:
        examples: Batch providing a ``"problem"`` column and a
            ``"generated_solution"`` column.

    Returns:
        A dict with the single key ``"conversations"``. Each entry is a list
        of two messages: the problem as the ``"user"`` turn followed by the
        solution as the ``"assistant"`` turn. Pairs are formed with ``zip``,
        so a longer column is truncated to the shorter one.
    """
    problems = examples["problem"]
    solutions = examples["generated_solution"]
    return {
        "conversations": [
            [
                {"role": "user", "content": problem},
                {"role": "assistant", "content": solution},
            ]
            for problem, solution in zip(problems, solutions)
        ],
    }
# Render the reasoning examples through the chat template as plain strings
# (tokenize=False keeps them untokenized).
reasoning_mapped = reasoning_dataset.map(generate_conversation, batched=True)
reasoning_conversations = tokenizer.apply_chat_template(
    list(reasoning_mapped["conversations"]),
    tokenize=False,
)

from unsloth.chat_templates import standardize_sharegpt

# Pass the chat dataset through standardize_sharegpt, then render it the
# same way.
dataset = standardize_sharegpt(non_reasoning_dataset)
non_reasoning_conversations = tokenizer.apply_chat_template(
    list(dataset["conversations"]),
    tokenize=False,
)
# Build a combined dataset that mixes 25% conversational vs. 75% reasoning
chat_percentage = 0.25

import pandas as pd

# Number of chat samples needed so that they make up `chat_percentage` of the
# final mix, given that every reasoning example is kept.
chat_to_reasoning_ratio = chat_percentage / (1 - chat_percentage)
num_chat_samples = int(len(reasoning_conversations) * chat_to_reasoning_ratio)

non_reasoning_subset = pd.Series(non_reasoning_conversations)
non_reasoning_subset = non_reasoning_subset.sample(
    num_chat_samples,
    random_state=2407,
)

# Report both sizes and the resulting chat fraction.
num_reasoning = len(reasoning_conversations)
num_chat = len(non_reasoning_subset)
print(num_reasoning)
print(num_chat)
print(num_chat / (num_chat + num_reasoning))

# Stack both sources into one Series named "text"; that name becomes the
# column name of the DataFrame handed to Dataset.from_pandas.
reasoning_series = pd.Series(reasoning_conversations)
chat_series = pd.Series(non_reasoning_subset)
data = pd.concat([reasoning_series, chat_series])
data.name = "text"

from datasets import Dataset

combined_dataset = Dataset.from_pandas(pd.DataFrame(data))
combined_dataset = combined_dataset.shuffle(seed=3407)
################################################################################
# Define trainer
################################################################################
from trl import SFTTrainer, SFTConfig

# Training hyperparameters, built first and then handed to the trainer.
training_args = SFTConfig(
    dataset_text_field="text",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # Use GA to mimic batch size!
    warmup_steps=5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps=30,
    learning_rate=2e-5,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.001,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",  # Use TrackIO/WandB etc
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=combined_dataset,
    eval_dataset=None,  # Can set up evaluation!
    args=training_args,
)
################################################################################
# Do fine tuning
################################################################################
print("DOING FINETUNING")
trainer_stats = trainer.train()

# Report the wall-clock training time in seconds and in minutes.
train_runtime = trainer_stats.metrics["train_runtime"]
print(f"{train_runtime} seconds used for training.")
print(f"{round(train_runtime/60, 2)} minutes used for training.")
################################################################################
# Inference
################################################################################
from transformers import TextStreamer

messages = [{"role": "user", "content": "Solve (x + 2)^2 = 0."}]

# Render the prompt as text, ending with the generation prompt and with
# thinking switched off.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # Must add for generation
    enable_thinking=False,  # Disable thinking
)

# Tokenize onto the GPU and stream the completion as it is generated; the
# streamer skips echoing the prompt.
model_inputs = tokenizer(text, return_tensors="pt").to("cuda")
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **model_inputs,
    max_new_tokens=256,  # Increase for longer outputs!
    temperature=0.7,  # For non thinking
    top_p=0.8,
    top_k=20,
    streamer=streamer,
)
################################################################################
# Convert model to torchao format and save
################################################################################
from unsloth.models._utils import _convert_torchao_model

_convert_torchao_model(model)

# Output name: "<last path component of MODEL_ID>-<QAT scheme>-unsloth-v3".
model_name = MODEL_ID.rsplit("/", 1)[-1]
save_to = f"{model_name}-{QAT_SCHEME}-unsloth-v3"

# Save locally
# model.save_pretrained(save_to, safe_serialization=False)
# tokenizer.save_pretrained(save_to)

# Or save to hub
from huggingface_hub import get_token, whoami
def _get_username() -> str:
    """Return the Hugging Face Hub account name for the current token.

    Fetches the token with ``get_token()``, queries ``whoami`` with it, and
    returns the ``"name"`` field of the response.
    """
    token = get_token()
    return whoami(token=token)["name"]
# Upload the converted model to "<username>/<save_to>" on the Hub.
# NOTE(review): safe_serialization=False is presumably required for the
# torchao-converted weights — confirm.
username = _get_username()
repo_id = f"{username}/{save_to}"
model.push_to_hub(repo_id, safe_serialization=False)
tokenizer.push_to_hub(f"{username}/{save_to}")

Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API
Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now
Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.