praxis-bookwriter-r8-qwen2.5-14b-sft-lora

3
1
14.0B
license:cc-by-nc-4.0
by
maldv
Other
OTHER
14B params
New
3 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
32GB+ RAM
Mobile
Laptop
Server
Quick Summary

Model Card for praxis-bookwriter-r8-qwen2.5-14b-sft-lora

Device Compatibility

Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
14GB+ RAM

Code Examples

Training Data (python)
# Example fine-tuning script (Unsloth + TRL SFTTrainer) for QLoRA training of
# Qwen2.5-14B-Instruct.
# NOTE(review): the imports (FastLanguageModel, TrainingArguments, SFTTrainer,
# is_bfloat16_supported) and the datasets ds_train_sft / ds_eval_sft are defined
# outside this excerpt — confirm against the full training notebook.
dtype = None  # None lets the loader choose the compute dtype automatically
max_seq_length = 17920  # maximum sequence length (tokens) used for training
load_in_4bit = True  # QLoRA: load base weights 4-bit quantized
rslora_rank = 8  # LoRA rank (r) for the adapters
output_dir = "outputs"  # checkpoint / output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the (quantized) base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA (rsLoRA) adapters to the attention and MLP
# projection layers.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # k_proj and down_proj are given a reduced rank/alpha of 4; the remaining
    # target modules use the default r = 8 / alpha = 8.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # enable rank-stabilized LoRA scaling
)

# Trainer hyperparameters. Effective train batch size per device = 2 * 2 = 4.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Use bf16 where the hardware supports it; otherwise fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning trainer; packing=False keeps each sample as its own
# sequence rather than concatenating samples up to max_seq_length.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Datapython
# SFT training script for the bookwriter rsLoRA fine-tune.
# NOTE(review): this exact script was previously pasted verbatim ~20 times in a
# row (a copy/scrape artifact). Re-running identical top-level statements only
# redoes the same loads and rebuilds the same objects, leaving the final bound
# names (model, tokenizer, targs, trainer) unchanged — collapsed to one copy.

dtype = None                # let the loader auto-select compute dtype by hardware
max_seq_length = 17920      # long-context training window
load_in_4bit = True         # 4-bit quantized base weights (QLoRA-style)
rslora_rank = 8             # default adapter rank; rsLoRA scaling enabled below
output_dir = "outputs"

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rsLoRA adapters to every attention and MLP projection.
# rank_pattern/alpha_pattern override k_proj and down_proj down to rank/alpha 4;
# all other targeted modules use r=8, alpha=8.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

# Trainer hyperparameters: effective train batch = 2 * 2 = 4 per device;
# 8-bit AdamW + cosine schedule; mixed precision picked by hardware support.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning driver; packing=False keeps one sample per sequence.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)
Training Data (Python):
# Published training recipe (duplicated in this card by the scraper);
# code kept byte-identical — review comments only.
dtype = None  # None -> let unsloth auto-select the compute dtype
max_seq_length = 17920  # max tokens per training example
load_in_4bit = True  # load base weights 4-bit quantized (QLoRA-style)
rslora_rank = 8  # default LoRA rank; k_proj/down_proj lowered to 4 below
output_dir = "outputs"  # checkpoint/output directory

MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"  # base model (HF hub id)

# Load the quantized base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rank-stabilized LoRA adapters to attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    # Per-module override: lower rank/alpha for k_proj and down_proj.
    rank_pattern     = {"k_proj": 4, "down_proj": 4},
    alpha_pattern    = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True  # rank-stabilized LoRA scaling
)

# Trainer hyperparameters: per-device batch 2 with 2 accumulation steps.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    # Prefer bf16 when the hardware supports it, else fall back to fp16.
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# TRL supervised fine-tuning trainer; ds_train_sft / ds_eval_sft are
# assumed to be built earlier in the notebook (not shown here).
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)

Deploy This Model

Production-ready deployment in minutes

Together.ai

Instant API access to this model

Fastest API

Production-ready inference API. Start free, scale to millions.

Try Free API

Replicate

One-click model deployment

Easiest Setup

Run models in the cloud with simple API. No DevOps required.

Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.