import threading

# Keep unsloth ahead of the other ML imports: it patches transformers/peft at
# import time and warns when it is imported after them.
from unsloth import FastLanguageModel

import torch
from peft import PeftModel
from transformers import TextIteratorStreamer
# Checkpoint identifiers, kept in one place so they are easy to swap.
# <-- Replace with your base model (it will be in your adapter_config.json)
BASE_MODEL_NAME = "unsloth/gemma-3-4b-it-unsloth-bnb-4bit"
# <-- Replace with your Lora
LORA_ADAPTER_NAME = "tieubaoca/gemma-3-4b-it-unsloth-bnb-4bit-finetune-vi-alpaca-lora"

# Load base model (4-bit quantized weights, float16 compute dtype).
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=BASE_MODEL_NAME,
    max_seq_length=2048,
    dtype=torch.float16,
    load_in_4bit=True,
)

# Attach the LoRA adapter on top of the base model with PEFT, then switch the
# combined model into Unsloth's inference mode before generating.
lora_model = PeftModel.from_pretrained(base_model, LORA_ADAPTER_NAME)
FastLanguageModel.for_inference(lora_model)
# Streaming chat generation (the prompt is formatted with the tokenizer's chat template)
def generate_streaming(
    model,
    tokenizer,
    message,
    *,
    max_new_tokens=256,
    temperature=0.7,
    top_p=0.9,
    top_k=40,
):
    """Generate a reply to ``message`` and print it as it is produced.

    The message is wrapped as a single user turn, formatted with the
    tokenizer's chat template, and passed to ``model.generate`` on a
    background thread. The calling thread consumes the streamer, echoing
    each decoded chunk to stdout.

    Args:
        model: Model exposing ``generate(**kwargs)``.
        tokenizer: Tokenizer/processor exposing ``apply_chat_template``.
        message: The user's prompt text.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        top_k: Number of highest-probability tokens kept for sampling.

    Returns:
        The full generated reply as one string (prompt and special tokens
        excluded).

    Raises:
        Exception: Whatever ``model.generate`` raised on the worker thread is
            re-raised here after the stream has been drained.
    """
    messages = [
        {
            "role": "user",
            "content": [{"type": "text", "text": message}],
        }
    ]

    # Put the inputs on the model's own device; fall back to "cuda" for model
    # objects that do not expose a ``device`` attribute.
    device = getattr(model, "device", "cuda")
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
        return_dict=True,
    ).to(device)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # Sampler settings
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        # temperature/top_p/top_k only take effect when sampling is enabled.
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        # NOTE(review): disabling the KV cache makes every step re-run the
        # whole prefix; kept from the original -- confirm it is a deliberate
        # workaround before removing.
        use_cache=False,
    )

    worker_errors = []

    def generate():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            # generate() failed before closing the stream; push the stop
            # signal ourselves so the consumer loop below cannot block forever.
            streamer.end()

    thread = threading.Thread(target=generate)
    thread.start()

    print(f"User: {message}")
    print("Assistant: ", end="", flush=True)

    pieces = []
    for new_token in streamer:
        if new_token:
            print(new_token, end="", flush=True)
            pieces.append(new_token)
    thread.join()
    print()

    if worker_errors:
        # Surface the worker's failure to the caller instead of losing it.
        raise worker_errors[0]
    return "".join(pieces)
# Inference: stream one reply from the LoRA-adapted model and keep the full
# text in `response`.
user_message = "What is the meaning of life?"
response = generate_streaming(lora_model, tokenizer, user_message)