# Weights & Biases credential, read from the environment (None when the variable is unset).
wandb_api_key = os.getenv("WANDB_API_KEY")

# Loader settings; max_seq_length is reused further down when the trainer is built.
max_seq_length = 3000
dtype = None  # NOTE(review): presumably lets the loader auto-select a dtype - confirm in the Unsloth docs
load_in_4bit = True

# Each JSONL file is loaded on its own, so its rows land in the builder's
# default "train" split - that is why the eval file also asks for split="train".
train_dataset = load_dataset("json", data_files="train_data.jsonl", split="train")
eval_dataset = load_dataset("json", data_files="eval_data.jsonl", split="train")

# Load the base model together with its tokenizer.
# The closing parenthesis of this call was missing, which made the whole
# script a syntax error.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="mistralai/Mistral-7B-Instruct-v0.2",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Authenticate against Weights & Biases with the key read from the environment,
# then open the run object for this training job.
wandb.login(key=wandb_api_key)
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project="Fine tuning mistral 7B",
)
# Wrap the loaded base model with LoRA adapters; `model` is rebound to the
# wrapped model that the trainer receives below.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter size and scaling.
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Projection layers that receive adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # Memory/reproducibility knobs.
    use_gradient_checkpointing=True,
    random_state=3407,
)
# Names used by the training arguments below: the run name is
# "<base model>-<project>" and checkpoints go to a directory of the same name
# under the current working directory.
project = "Nbot-small"
base_model_name = "unsloth"
run_name = f"{base_model_name}-{project}"
output_dir = f"./{run_name}"
# Build the supervised fine-tuning trainer. Each dataset row is expected to
# carry its full training text in the "chat" field.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="chat",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        # Effective train batch: 2 sequences x 4 accumulation steps.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        # The eval batch size defaults to 8 when left unset - four times the
        # train batch - so the first evaluation pass at the epoch boundary
        # needs far more memory per step than any training step did.
        # Keep it in line with the train batch size.
        # NOTE(review): likely cause of the failure reported right before the
        # first evaluation (epoch ~0.99) - confirm against the actual traceback.
        per_device_eval_batch_size=2,
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
        warmup_steps=50,
        max_steps=10000,
        learning_rate=2e-4,
        # Exactly one of fp16/bf16 is enabled, chosen by hardware support.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        fp16_full_eval=True,
        # Checkpoint and evaluate once per epoch. For a step-based cadence,
        # switch both strategies to "steps" and set save_steps / eval_steps
        # (250 was the value previously tried here).
        save_strategy="epoch",
        evaluation_strategy="epoch",
        do_eval=True,
        logging_steps=25,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir=output_dir,
        report_to="wandb",
        logging_dir="./logs",
    ),
)
# Run the fine-tuning loop; whatever train() returns is kept in trainer_stats.
trainer_stats = trainer.train()
# Persist the trained model under the "lora_model" path.
# NOTE(review): since `model` is the PEFT-wrapped model, this presumably writes
# only the adapter weights, and the tokenizer is not saved here - confirm.
model.save_pretrained("lora_model")
Example dataset row (see the message below; the rows are too long for this message ;P):
Hey, I'm trying to fine-tune Mistral to be a better ReAct/RAG agent with about 5000 rows of data. Using the normal SFTTrainer and following Hugging Face's example works fine, albeit with higher VRAM usage and lower speed. With Unsloth, however, training failed after the 657th row and I'm not sure why; it seems related to the fact that it was at epoch 0.99, just before my evaluation. Any insight would be very useful, as I'd love to use Unsloth for training so that my PC is actually usable whilst training and so that I can iterate and tune my parameters quickly. Is it the fact that my dataset already follows the Mistral chat template and I'm loading it directly? Many thanks, Volts.

True