#Fine-tuning takes very long on H200 and 1B params

143 messages · Page 1 of 1 (latest)

fervent heath
#

it keeps showing 0.12 it/s

#

this is my code

#
%%capture
# Environment setup cell: reinstall Unsloth and the libraries it is used with here.
# First uninstall any existing unsloth / unsloth_zoo / trl / transformers.
%uv pip uninstall unsloth unsloth_zoo trl transformers -y
import re
# v = the leading "major.minor" portion of the installed torch version string.
import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
# Choose the xformers pin from the torch version:
#   2.9 -> 0.0.33.post1, 2.8 -> 0.0.32.post2, anything else -> 0.0.29.post3.
xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
# {xformers} below is filled in from the Python variable built above.
%uv pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
%uv pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
%uv pip install --no-deps unsloth
# transformers and trl are installed last, pinned to exact versions.
%uv pip install transformers==4.56.2
%uv pip install --no-deps trl==0.22.2
%uv pip install wandb```
#
# Load the base model and its tokenizer through Unsloth's FastLanguageModel.
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# With 4bit disabled here, the load log in this thread reports "Switching to 16bit LoRA".
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.

# Returns the (model, tokenizer) pair used by the later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "google/gemma-3-1b-it",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)```
#

output

:sloth: Unsloth: Will patch your computer to enable 2x faster free finetuning.
:sloth: Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.1.4: Fast Gemma3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA H200. Num GPUs = 1. Max memory: 139.801 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu129. CUDA: 9.0. CUDA Toolkit: 12.9. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3 does not support SDPA - switching to fast eager.
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.```
#
# Wrap the loaded model with LoRA adapters on the seven projection modules
# listed in target_modules; the result replaces `model`.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    # NOTE(review): the thread suggests trying use_rslora = True with a much lower
    # lora_alpha (roughly 8-16 at r = 64) and comparing against this setup on a
    # small subset (~2000 entries) before a full run.
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)```
#
import os

# Load datasets from HuggingFace Hub
# NOTE(review): load_dataset is not imported in this cell — presumably
# `from datasets import load_dataset` ran in a cell not shown; confirm.
train_ds = load_dataset("Reubencf/konkani-gemma-train")
validate_ds = load_dataset("Reubencf/konkani-gemma-validate")

# Get the train split from each dataset - use the correct variable names!
# The validation repo is also read through its "train" split.
train_dataset = train_ds["train"]
eval_dataset = validate_ds["train"]

# NOTE(review): avoid hard-coding the API key in the notebook. Also, the SFTConfig
# posted in this thread sets report_to = "none" — confirm whether W&B logging is wanted.
os.environ["WANDB_API_KEY"] = ".................................................................."```
#
# Build the supervised fine-tuning trainer from the LoRA-wrapped model and the
# train/eval splits loaded earlier.
# NOTE(review): SFTTrainer / SFTConfig are not imported in this cell — presumably
# `from trl import SFTTrainer, SFTConfig` ran in a cell not shown; confirm.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    # Rows are read from the "text" column; the sample row quoted in this thread
    # already contains chat-template tokens (<start_of_turn>, <end_of_turn>).
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = True, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        # NOTE(review): directory and repo names say llama3.1-8b, but the model
        # loaded in this thread is google/gemma-3-1b-it — rename to avoid confusion?
        output_dir="./konkani-llama3.1-8b-instruct",
        hub_model_id="Reubencf/konkani-llama3.1-8b-instruct",  # Optional: custom repo name
        push_to_hub=True,  # Upload after each save
        hub_strategy="checkpoint", 
        num_train_epochs=2,
        save_steps=500,
        eval_steps=500,
        group_by_length = False,
        # Effective batch = 8 per device x 8 accumulation steps = 64 sequences per
        # optimizer step, with max_seq_length = 4096 and packing enabled.
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 8,
        learning_rate=2e-4,
        lr_scheduler_type="cosine",
        warmup_ratio=0.1,
        weight_decay=0.01,
        max_grad_norm=1.0,
        seed = 3407,
        bf16=True,
        dataloader_num_workers = 4,
        dataloader_pin_memory = True,
        # NOTE(review): wandb is installed and WANDB_API_KEY is set elsewhere in this
        # thread — confirm that "none" is intended here.
        report_to = "none", # Use TrackIO/WandB etc
        # Eval & logging
        logging_steps=100,
        logging_first_step = True,  
        eval_strategy="steps",            # Evaluate every eval_steps (500) steps, not per epoch
        save_strategy="steps",            # Save every save_steps (500) steps, not per epoch
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    ),
)```
#
# Print the name and total memory of CUDA device 0, plus the max memory reserved
# so far. Byte counts are divided by 1024 three times and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")```
#
2.102 GB of memory reserved.```
#
# Start training with the trainer built above; the return value is kept in trainer_stats.
trainer_stats = trainer.train()```
thorny perch
#

please provide a sample dataset instead of Reubencf/konkani-gemma-train, as that's gated/locked. You don't need to share your dataset; make a minimal reproducible one of similar length

fervent heath
#

whats your hf username

thorny perch
fervent heath
thorny perch
#

hold on

#

14/2884 [01:54<5:31:05, 6.92s/it]

#

You are literally running 64 batch size on 4k tokens

#

I think I told you this like 5 times

#

You throttle the compute, that's the limit of your GPU

#

You also have packing enabled on top of that

#

Your GPU can only process so much tokens, it can't go higher

#

Regardless of how much VRAM you use

fervent heath
thorny perch
#

will check

fervent heath
#

it shows higher time

thorny perch
#

ah ok will check hold on

fervent heath
#

which GPU are u using by the way ?

thorny perch
#

h200

#

why is your hub_model_id "konkani-llama3.1-8b-instruct" when you train and load gemma 3 btw?

thorny perch
#

and why does your dataset include chat template tokens?

#

{"text": "<start_of_turn>user\nYou are a knowle

#

.<end_of_turn>"}

fervent heath
#

and providing it to the model

thorny perch
#

what do you mean?

#

that's not how you should do it

fervent heath
thorny perch
#

I suspect that could be messing with your stuff

#

I'm not sure, but I see that as an issue at least, even if it's not performance related

#

but if you load it as raw text I'm not sure

#

but regardless that's not how you should do it

#

your dataset should not contain template tokens

#

hold on.

fervent heath
fervent heath
thorny perch
fervent heath
#

ok

#

take your time

thorny perch
#

Does your dataset contain 1 turn each or multi turn?

fervent heath
#

1 turn each

#

instruction,system and response

#

just one turn

thorny perch
#

Yeah it seems that's the time it will take for your training

#

It's not an issue as far as I can see

#

the performance

#

Even if you hit 100% utilization, doesnt matter what batch size you use you still iterate 2 epochs

#

and you have 92K entries

#

that takes 5-6 hours or so

fervent heath
#

oh cause i was worried it was showing 0.1 it/s

#

i thought it should be faster

thorny perch
#

That it/s is faulty

fervent heath
#

and what about the chat template part

thorny perch
#

the time is correct

fervent heath
#

i no need to upload right like that

#

instead raw text and then format it

thorny perch
#

you don't want to have tokens within dataset

fervent heath
#

but yeah ill stick to raw text

#

and then format it

#

in the code

thorny perch
#

Yeah

#

The it/s on your part is misleading

#

but the time is same

fervent heath
#

but otherwise everything is fine right ?

thorny perch
#

yes

#

5-6 hours more or less approx

#

for 2 epochs on that size

#

on that model

fervent heath
#

ok 👍

#

cause 8b takes around 10 hrs

#

so yeah

thorny perch
#

yeah

#

normal

#

just clean the data and run it properly , even if it works with hacky tokens

#

dont do that

#

that's it

fervent heath
#

thank you very much @thorny perch

thorny perch
#

gl hf!

#

close this thread if you consider it finished btw

#

or mark as solved rather

#

1143/92238 [04:29<6:12:22, 4.08it/s]

#

that's on 2 batch size

#

for insight, same perf

fervent heath
thorny perch
#

yeah just to test

#

the compute wont go faster than 100% of its capability regardless

#

so run with whatever batch size that doesn't throttle you for some reason

#

if its 5-6 hours that's normal

fervent heath
#

im worried because later on i will be trying higher params thats why

thorny perch
#

all good seems fine

#

I suggest you train on 5-10K entries max and see performance on your parameters, maybe even less 1-5K entries

#

once you decide parameters you scale up dataset entries

#

it's not productive to do parameter tuning on 5-6 hours runs

#

30min max.

fervent heath
thorny perch
#

even 1 hour depending on case

fervent heath
#

since I'm finetuning a low resource language

thorny perch
#

yeah in that case go higher on rank and enable rslora

fervent heath
thorny perch
#

run with for example 256 rank or 128, enable rslora, and set alpha to 16 or 32

fervent heath
#

64 is max

thorny perch
#

oh right

fervent heath
#

thats the catch

thorny perch
#

you can still enable rslora

#

set alpha to approx 16

#

try that

#

once

#

see if that helps

#

gl!

#

Remove my access from your HF btw

fervent heath
thorny perch
#

rslora scales alpha differently mathematically

#

with a rank of 64, alpha of 16 or even 8 is good

#

or somewhere between

fervent heath
thorny perch
#

you can run small scale and compare

#

2000 entries

#

compare rslora vs non rslora

fervent heath
#

ok

thorny perch
#

remember change alpha accordingly

fervent heath
#

ok thanks

fervent heath
#

@thorny perch i realized that i haven't been using an optimizer, is adamw_8bit good ?

#

or the normal adamw ?

thorny perch
#

use 8bit as I mentioned earlier

#

if it fits in ur vram u can use regular optimizer if you prefer