Hi guys, I am facing an issue: I am fine-tuning the Llama 8B model using the same code that Unsloth AI provides on Google Colab. When I use the trained model for inference on Google Colab, it gives accurate output, but when I save the model locally and then load that local model for inference, the results are very erratic. I don't understand where I am going wrong.
The following is the code I used to load the locally saved fine-tuned model (it was stored using Q8 quantization):
from unsloth import FastLanguageModel
# Prompt template filled in with ``alpaca_prompt.format(...)``: a newline
# followed by a single positional placeholder.
# NOTE(review): generate_code passes three arguments to format(), but only the
# first one is consumed by this template -- confirm it matches the prompt
# layout that was used during fine-tuning.
alpaca_prompt = "\n{}"

# Sequence length handed to the loader. Per the upstream note any value may be
# chosen, since RoPE scaling is supported automatically.
max_seq_length = 2048
# None requests auto-detection; Float16 suits Tesla T4 / V100, Bfloat16 suits Ampere+.
dtype = None
# Forwarded unchanged to FastLanguageModel.from_pretrained.
load_in_4bit = True
# Load the locally saved fine-tuned checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "FinetunedBR", # Tried loading both from safe tensors and Lora adapters
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# NOTE: FastLanguageModel.get_peft_model(...) is deliberately NOT called here.
# That call attaches new trainable LoRA adapters and belongs to the training
# setup; for inference the saved checkpoint is used as loaded.
# Switch the loaded model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)
def generate_code(instruction):
    """Generate text for ``instruction`` using the loaded fine-tuned model.

    The instruction is inserted into ``alpaca_prompt``, tokenized, moved to the
    CUDA device and passed to ``model.generate``.

    Args:
        instruction: Instruction text placed in the first placeholder of
            ``alpaca_prompt``.

    Returns:
        The list of decoded strings produced by ``tokenizer.batch_decode`` for
        the sequences returned by ``model.generate``.
    """
    # Use the caller's instruction; previously a fixed string was hard-coded
    # here, so the argument was silently ignored.
    prompt = alpaca_prompt.format(
        instruction, # instruction
        "", # input
        "", # output - leave this blank for generation!
    )
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    generated_business_Rule = tokenizer.batch_decode(outputs)
    return generated_business_Rule