praxis-bookwriter-r8-qwen2.5-14b-sft-lora
3
1
14.0B
license:cc-by-nc-4.0
by
maldv
Other
OTHER
14B params
New
3 downloads
Early-stage
Edge AI:
Mobile
Laptop
Server
32GB+ RAM
Mobile
Laptop
Server
Quick Summary
Model Card for praxis-bookwriter-r8-qwen2.5-14b-sft-lora
Device Compatibility
Mobile
4-6GB RAM
Laptop
16GB RAM
Server
GPU
Minimum Recommended
14GB+ RAM
Code Examples
Training Data (Python)
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
dtype = None
max_seq_length = 17920
load_in_4bit = True
rslora_rank = 8
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_NAME_TO_LOAD,
max_seq_length = max_seq_length,
dtype = dtype,
load_in_4bit = load_in_4bit,
)
model = FastLanguageModel.get_peft_model(
model, r = rslora_rank,
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"],
lora_alpha = 8, lora_dropout = 0.02, bias = "none",
rank_pattern = {"k_proj": 4, "down_proj": 4},
alpha_pattern = {"k_proj": 4, "down_proj": 4},
use_gradient_checkpointing = "unsloth", random_state = 3407,
use_rslora = True
)
targs = TrainingArguments(
per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
eval_steps = 25, save_strategy = "steps", save_steps = 10,
save_total_limit = 3, output_dir = output_dir,
report_to="wandb", remove_unused_columns=False,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer,
train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
max_seq_length=max_seq_length, packing=False, args=targs,
)Training Datapython
# --- unsloth QLoRA fine-tune of Qwen2.5-14B-Instruct: config + trainer setup ---
dtype = None                    # None lets unsloth pick the compute dtype
max_seq_length = 17920          # long-context SFT window (tokens)
load_in_4bit = True             # QLoRA: base weights loaded in 4-bit
rslora_rank = 8                 # adapter rank; rsLoRA scaling enabled below
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rsLoRA adapters to all attention and MLP projections.
# k_proj and down_proj get a reduced rank/alpha of 4 via the *_pattern overrides.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern = {"k_proj": 4, "down_proj": 4},
    alpha_pattern = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

# Trainer hyper-parameters; effective train batch = 2 * 2 = 4 sequences.
# NOTE(review): gradient_checkpointing=True here duplicates
# use_gradient_checkpointing="unsloth" above — confirm the two do not conflict.
# NOTE(review): save_steps=10 does not align with eval_steps=25; align them if
# load_best_model_at_end is ever enabled.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning driver; packing disabled, so each example is
# truncated independently at max_seq_length.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)

### Training Data (python)
# --- unsloth QLoRA fine-tune of Qwen2.5-14B-Instruct: config + trainer setup ---
dtype = None                    # None lets unsloth pick the compute dtype
max_seq_length = 17920          # long-context SFT window (tokens)
load_in_4bit = True             # QLoRA: base weights loaded in 4-bit
rslora_rank = 8                 # adapter rank; rsLoRA scaling enabled below
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rsLoRA adapters to all attention and MLP projections.
# k_proj and down_proj get a reduced rank/alpha of 4 via the *_pattern overrides.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern = {"k_proj": 4, "down_proj": 4},
    alpha_pattern = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

# Trainer hyper-parameters; effective train batch = 2 * 2 = 4 sequences.
# NOTE(review): gradient_checkpointing=True here duplicates
# use_gradient_checkpointing="unsloth" above — confirm the two do not conflict.
# NOTE(review): save_steps=10 does not align with eval_steps=25; align them if
# load_best_model_at_end is ever enabled.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning driver; packing disabled, so each example is
# truncated independently at max_seq_length.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)

### Training Data (python)
# --- unsloth QLoRA fine-tune of Qwen2.5-14B-Instruct: config + trainer setup ---
dtype = None                    # None lets unsloth pick the compute dtype
max_seq_length = 17920          # long-context SFT window (tokens)
load_in_4bit = True             # QLoRA: base weights loaded in 4-bit
rslora_rank = 8                 # adapter rank; rsLoRA scaling enabled below
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rsLoRA adapters to all attention and MLP projections.
# k_proj and down_proj get a reduced rank/alpha of 4 via the *_pattern overrides.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern = {"k_proj": 4, "down_proj": 4},
    alpha_pattern = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

# Trainer hyper-parameters; effective train batch = 2 * 2 = 4 sequences.
# NOTE(review): gradient_checkpointing=True here duplicates
# use_gradient_checkpointing="unsloth" above — confirm the two do not conflict.
# NOTE(review): save_steps=10 does not align with eval_steps=25; align them if
# load_best_model_at_end is ever enabled.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning driver; packing disabled, so each example is
# truncated independently at max_seq_length.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)

### Training Data (python)
# --- unsloth QLoRA fine-tune of Qwen2.5-14B-Instruct: config + trainer setup ---
dtype = None                    # None lets unsloth pick the compute dtype
max_seq_length = 17920          # long-context SFT window (tokens)
load_in_4bit = True             # QLoRA: base weights loaded in 4-bit
rslora_rank = 8                 # adapter rank; rsLoRA scaling enabled below
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rsLoRA adapters to all attention and MLP projections.
# k_proj and down_proj get a reduced rank/alpha of 4 via the *_pattern overrides.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern = {"k_proj": 4, "down_proj": 4},
    alpha_pattern = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

# Trainer hyper-parameters; effective train batch = 2 * 2 = 4 sequences.
# NOTE(review): gradient_checkpointing=True here duplicates
# use_gradient_checkpointing="unsloth" above — confirm the two do not conflict.
# NOTE(review): save_steps=10 does not align with eval_steps=25; align them if
# load_best_model_at_end is ever enabled.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning driver; packing disabled, so each example is
# truncated independently at max_seq_length.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)

### Training Data (python)
# --- unsloth QLoRA fine-tune of Qwen2.5-14B-Instruct: config + trainer setup ---
dtype = None                    # None lets unsloth pick the compute dtype
max_seq_length = 17920          # long-context SFT window (tokens)
load_in_4bit = True             # QLoRA: base weights loaded in 4-bit
rslora_rank = 8                 # adapter rank; rsLoRA scaling enabled below
output_dir = "outputs"
MODEL_NAME_TO_LOAD = "Qwen/Qwen2.5-14B-Instruct"

# Load the quantized base model and its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = MODEL_NAME_TO_LOAD,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# Attach rsLoRA adapters to all attention and MLP projections.
# k_proj and down_proj get a reduced rank/alpha of 4 via the *_pattern overrides.
model = FastLanguageModel.get_peft_model(
    model, r = rslora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 8, lora_dropout = 0.02, bias = "none",
    rank_pattern = {"k_proj": 4, "down_proj": 4},
    alpha_pattern = {"k_proj": 4, "down_proj": 4},
    use_gradient_checkpointing = "unsloth", random_state = 3407,
    use_rslora = True
)

# Trainer hyper-parameters; effective train batch = 2 * 2 = 4 sequences.
# NOTE(review): gradient_checkpointing=True here duplicates
# use_gradient_checkpointing="unsloth" above — confirm the two do not conflict.
# NOTE(review): save_steps=10 does not align with eval_steps=25; align them if
# load_best_model_at_end is ever enabled.
targs = TrainingArguments(
    per_device_train_batch_size = 2, gradient_accumulation_steps = 2,
    learning_rate = 1.5e-4, weight_decay = 0.001, gradient_checkpointing = True,
    max_grad_norm = 1.0, warmup_steps = 50, num_train_epochs = 3,
    optim = "adamw_8bit", lr_scheduler_type = "cosine", seed = 3407,
    fp16 = not is_bfloat16_supported(), bf16 = is_bfloat16_supported(),
    logging_steps = 1, per_device_eval_batch_size = 1, eval_strategy = "steps",
    eval_steps = 25, save_strategy = "steps", save_steps = 10,
    save_total_limit = 3, output_dir = output_dir,
    report_to="wandb", remove_unused_columns=False,
)

# Supervised fine-tuning driver; packing disabled, so each example is
# truncated independently at max_seq_length.
trainer = SFTTrainer(
    model=model, tokenizer=tokenizer,
    train_dataset=ds_train_sft, eval_dataset=ds_eval_sft,
    max_seq_length=max_seq_length, packing=False, args=targs,
)

### Deploy This Model
Production-ready deployment in minutes
Together.ai
Instant API access to this model
Production-ready inference API. Start free, scale to millions.
Try Free API

Replicate
One-click model deployment
Run models in the cloud with simple API. No DevOps required.
Deploy Now

Disclosure: We may earn a commission from these partners. This helps keep LLMYourWay free.